Skip to content

Commit

Permalink
zuc: minor optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
emmansun authored Nov 8, 2024
1 parent a33c2ae commit b721bed
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 41 deletions.
33 changes: 13 additions & 20 deletions zuc/asm_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ DATA Low_nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA Low_nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL Low_nibble_mask<>(SB), RODATA, $16

DATA High_nibble_mask<>+0x00(SB)/8, $0xF0F0F0F0F0F0F0F0
DATA High_nibble_mask<>+0x08(SB)/8, $0xF0F0F0F0F0F0F0F0
GLOBL High_nibble_mask<>(SB), RODATA, $16

DATA P1<>+0x00(SB)/8, $0x0A020F0F0E000F09
DATA P1<>+0x08(SB)/8, $0x090305070C000400
GLOBL P1<>(SB), RODATA, $16
Expand Down Expand Up @@ -99,10 +95,9 @@ GLOBL flip_mask<>(SB), RODATA, $16
#define S0_comput_SSE(IN_OUT, XTMP1, XTMP2) \
MOVOU IN_OUT, XTMP1 \
\
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
\
PAND High_nibble_mask<>(SB), XTMP1 \
PSRLQ $4, XTMP1 \ // x1
PAND Low_nibble_mask<>(SB), XTMP1 \
PAND Low_nibble_mask<>(SB), IN_OUT \ // x2
\
MOVOU P1<>(SB), XTMP2 \
PSHUFB IN_OUT, XTMP2 \ // P1[x2]
Expand All @@ -124,16 +119,15 @@ GLOBL flip_mask<>(SB), RODATA, $16
// for high and low nibble of each input byte, SSE version.
#define MUL_PSHUFB_SSE(XIN, XLO, XHI_OUT, XTMP) \
\ // Get low nibble of input data
MOVOU Low_nibble_mask<>(SB), XTMP \
PAND XIN, XTMP \
MOVOU XIN, XTMP \
PAND Low_nibble_mask<>(SB), XTMP \
\ // Get low nibble of output
PSHUFB XTMP, XLO \
\ // Get high nibble of input data
MOVOU High_nibble_mask<>(SB), XTMP \
PAND XIN, XTMP \
PSRLQ $4, XTMP \
PSRLQ $4, XIN \
PAND Low_nibble_mask<>(SB), XIN \
\ // Get high nibble of output
PSHUFB XTMP, XHI_OUT \
PSHUFB XIN, XHI_OUT \
\ // XOR high and low nibbles to get full bytes
PXOR XLO, XHI_OUT

Expand All @@ -146,8 +140,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
PSHUFB Shuf_mask<>(SB), XTMP2 \
AESENCLAST Cancel_aes<>(SB), XTMP2 \
\
MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \
MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
MOVOU Comb_matrix_mul_low_nibble<>(SB), XTMP1 \
MOVOU Comb_matrix_mul_high_nibble<>(SB), XIN_OUT \
MUL_PSHUFB_SSE(XTMP2, XTMP1, XIN_OUT, XTMP3)

// Rotate left 5 bits in each byte, within an XMM register, AVX version.
Expand All @@ -160,9 +154,8 @@ GLOBL flip_mask<>(SB), RODATA, $16

// Compute 16 S0 box values from 16 bytes, AVX version.
#define S0_comput_AVX(IN_OUT, XTMP1, XTMP2) \
VPAND High_nibble_mask<>(SB), IN_OUT, XTMP1 \
VPSRLQ $4, XTMP1, XTMP1 \ // x1
\
VPSRLQ $4, IN_OUT, XTMP1 \ // x1
VPAND Low_nibble_mask<>(SB), XTMP1, XTMP1 \
VPAND Low_nibble_mask<>(SB), IN_OUT, IN_OUT \ // x2
\
VMOVDQU P1<>(SB), XTMP2 \
Expand All @@ -189,8 +182,8 @@ GLOBL flip_mask<>(SB), RODATA, $16
\ // Get low nibble of output
VPSHUFB XTMP, XLO, XLO \
\ // Get high nibble of input data
VPAND High_nibble_mask<>(SB), XIN, XTMP \
VPSRLQ $4, XTMP, XTMP \
VPSRLQ $4, XIN, XTMP \
VPAND Low_nibble_mask<>(SB), XTMP, XTMP \
\ // Get high nibble of output
VPSHUFB XTMP, XHI_OUT, XHI_OUT \
\ // XOR high and low nibbles to get full bytes
Expand Down
4 changes: 1 addition & 3 deletions zuc/asm_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,7 @@ GLOBL mask_S01<>(SB), RODATA, $32
VORR XTMP0.B16, XDATA.B16, XDATA.B16

#define S0_comput(IN_OUT, XTMP1, XTMP2) \
VUSHR $4, IN_OUT.S4, XTMP1.S4 \
VAND NIBBLE_MASK.B16, XTMP1.B16, XTMP1.B16 \
\
VUSHR $4, IN_OUT.B16, XTMP1.B16 \
VAND NIBBLE_MASK.B16, IN_OUT.B16, IN_OUT.B16 \
\
VTBL IN_OUT.B16, [P1.B16], XTMP2.B16 \
Expand Down
15 changes: 6 additions & 9 deletions zuc/asm_ppc64x.s
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,8 @@ GLOBL rcon<>(SB), RODATA, $160
LXVD2X (R4)(R5), S1_MASK

#define S0_comput(IN_OUT, V_FOUR, XTMP1, XTMP2) \
VSRW IN_OUT, V_FOUR, XTMP1; \
VAND XTMP1, NIBBLE_MASK, XTMP1; \
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \
VSRB IN_OUT, V_FOUR, XTMP1; \ // XTMP1 = hi 4 bits of IN_OUT
VAND IN_OUT, NIBBLE_MASK, IN_OUT; \ // low 4 bits of IN_OUT
VPERM P1, P1, IN_OUT, XTMP2; \
VXOR XTMP1, XTMP2, XTMP2; \
VPERM P2, P2, XTMP2, XTMP1; \
Expand All @@ -87,8 +86,6 @@ GLOBL rcon<>(SB), RODATA, $160
// zuc sbox function
// parameters:
// - x: 128 bits register as sbox input/output data
// - y: 128 bits temp register
// - z: 128 bits temp register
#define S1_comput(x, y, z) \
VPERMXOR M1H, M1L, x, x; \
VSBOX x, x; \
Expand Down Expand Up @@ -213,7 +210,7 @@ GLOBL rcon<>(SB), RODATA, $160
\ // LFSR_S16 = (LFSR_S15++) = W
MOVW W, (((0 + idx) % 16)*4)(addr)

#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
#define RESTORE_LFSR_0(addr, tmpR1, tmpR2, tmpR3, tmpR4) \
MOVWZ (addr), tmpR1 \
MOVD $4, tmpR4 \
LXVD2X (tmpR4)(addr), V0 \
Expand All @@ -232,7 +229,7 @@ GLOBL rcon<>(SB), RODATA, $160
MOVW tmpR3, 56(addr) \
MOVW tmpR1, 60(addr)

#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
#define RESTORE_LFSR_2(addr, tmpR1, tmpR2, tmpR3) \
MOVD (addr), tmpR1 \
MOVD $8, tmpR2 \
LXVD2X (tmpR2)(addr), V0 \
Expand All @@ -250,7 +247,7 @@ GLOBL rcon<>(SB), RODATA, $160
MOVD tmpR3, 48(addr) \
MOVD tmpR1, 56(addr)

#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
#define RESTORE_LFSR_4(addr, tmpR1, tmpR2, tmpR3) \
LXVD2X (addr), V0 \
MOVD $16, tmpR1 \
LXVD2X (tmpR1)(addr), V1 \
Expand All @@ -264,7 +261,7 @@ GLOBL rcon<>(SB), RODATA, $160
STXVD2X V3, (tmpR2)(addr) \
STXVD2X V0, (tmpR3)(addr)

#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
#define RESTORE_LFSR_8(addr, tmpR1, tmpR2, tmpR3) \
LXVD2X (addr), V0 \
MOVD $16, tmpR1 \
LXVD2X (tmpR1)(addr), V1 \
Expand Down
8 changes: 8 additions & 0 deletions zuc/core_asm.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

package zuc

import (
"github.com/emmansun/gmsm/internal/cpuid"
"golang.org/x/sys/cpu"
)

// Generate single keyword, 4 bytes.
//
//go:noescape
Expand All @@ -12,6 +17,9 @@ func genKeywordAsm(s *zucState32) uint32
//go:noescape
func genKeyStreamAsm(keyStream []uint32, pState *zucState32)

var supportsAES = cpuid.HasAES
var useAVX = cpu.X86.HasAVX

func genKeyStream(keyStream []uint32, pState *zucState32) {
if supportsAES {
genKeyStreamAsm(keyStream, pState)
Expand Down
6 changes: 2 additions & 4 deletions zuc/eia256_asm_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@ TEXT ·eia256RoundTag8(SB),NOSPLIT,$0
// Reverse data bytes
VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VUSHR $4, XDATA.B16, XTMP1.B16

VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
Expand Down Expand Up @@ -115,8 +114,7 @@ TEXT ·eia256RoundTag16(SB),NOSPLIT,$0
// Reverse data bytes
VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VUSHR $4, XDATA.B16, XTMP1.B16

VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
Expand Down
3 changes: 0 additions & 3 deletions zuc/eia_asm.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,9 @@ package zuc

import (
"github.com/emmansun/gmsm/internal/cpuid"
"golang.org/x/sys/cpu"
)

var supportsAES = cpuid.HasAES
var supportsGFMUL = cpuid.HasGFMUL || cpuid.HasVPMSUMD
var useAVX = cpu.X86.HasAVX

//go:noescape
func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int)
Expand Down
3 changes: 1 addition & 2 deletions zuc/eia_asm_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ TEXT ·eia3Round16B(SB),NOSPLIT,$0
// Reverse data bytes
VLD1 (CX), [XDATA.B16]
VAND BIT_REV_AND_TAB.B16, XDATA.B16, XTMP3.B16
VUSHR $4, XDATA.S4, XTMP1.S4
VAND BIT_REV_AND_TAB.B16, XTMP1.B16, XTMP1.B16
VUSHR $4, XDATA.B16, XTMP1.B16

VTBL XTMP3.B16, [BIT_REV_TAB_H.B16], XTMP3.B16
VTBL XTMP1.B16, [BIT_REV_TAB_L.B16], XTMP1.B16
Expand Down

0 comments on commit b721bed

Please sign in to comment.