mirror of https://github.com/emmansun/gmsm.git
sm3: multi-blocks refactoring

This commit is contained in:
parent 2c688bb9d3
commit 1cf81a8e7c
@@ -86,6 +86,16 @@ GLOBL r08_mask<>(SB), 8, $32

// load 256 bits
#define loadWord(W, i) VMOVDQU (256+(i)*32)(BX), W

#define REV32(a, b, c, d, e, f, g, h) \
    VPSHUFB flip_mask<>(SB), a, a; \
    VPSHUFB flip_mask<>(SB), b, b; \
    VPSHUFB flip_mask<>(SB), c, c; \
    VPSHUFB flip_mask<>(SB), d, d; \
    VPSHUFB flip_mask<>(SB), e, e; \
    VPSHUFB flip_mask<>(SB), f, f; \
    VPSHUFB flip_mask<>(SB), g, g; \
    VPSHUFB flip_mask<>(SB), h, h

#define prepare8Words(i) \
    VMOVDQU (i*32)(srcPtr1), a; \
    VMOVDQU (i*32)(srcPtr2), b; \

@@ -97,14 +107,7 @@ GLOBL r08_mask<>(SB), 8, $32

    VMOVDQU (i*32)(srcPtr8), h; \
    ; \
    TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4); \
    VPSHUFB flip_mask<>(SB), a, a; \
    VPSHUFB flip_mask<>(SB), b, b; \
    VPSHUFB flip_mask<>(SB), c, c; \
    VPSHUFB flip_mask<>(SB), d, d; \
    VPSHUFB flip_mask<>(SB), e, e; \
    VPSHUFB flip_mask<>(SB), f, f; \
    VPSHUFB flip_mask<>(SB), g, g; \
    VPSHUFB flip_mask<>(SB), h, h; \
    REV32(a, b, c, d, e, f, g, h); \
    ; \
    storeWord(a, 8*i+0); \
    storeWord(b, 8*i+1); \

@@ -115,25 +118,25 @@ GLOBL r08_mask<>(SB), 8, $32

    storeWord(g, 8*i+6); \
    storeWord(h, 8*i+7)
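For reference, the net effect of the two prepare8Words calls: eight independent 64-byte message blocks are loaded, the 8x8 dword transpose regroups them word by word, and REV32 byte-swaps every lane from big-endian. A scalar Go sketch of the equivalent result, assuming encoding/binary; the names interleave8, blocks and words are illustrative, not from the source:

    // interleave8 models prepare8Words(0) + prepare8Words(1): word t of
    // each of the eight blocks lands in lane `lane` of vector t, so one
    // YMM register holds the same message word of all eight blocks.
    func interleave8(blocks *[8][64]byte) (words [16][8]uint32) {
        for t := 0; t < 16; t++ {
            for lane := 0; lane < 8; lane++ {
                words[t][lane] = binary.BigEndian.Uint32(blocks[lane][t*4:])
            }
        }
        return
    }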

#define saveState \
    VMOVDQU a, (0*32)(BX); \
    VMOVDQU b, (1*32)(BX); \
    VMOVDQU c, (2*32)(BX); \
    VMOVDQU d, (3*32)(BX); \
    VMOVDQU e, (4*32)(BX); \
    VMOVDQU f, (5*32)(BX); \
    VMOVDQU g, (6*32)(BX); \
    VMOVDQU h, (7*32)(BX)
#define saveState(R) \
    VMOVDQU a, (0*32)(R); \
    VMOVDQU b, (1*32)(R); \
    VMOVDQU c, (2*32)(R); \
    VMOVDQU d, (3*32)(R); \
    VMOVDQU e, (4*32)(R); \
    VMOVDQU f, (5*32)(R); \
    VMOVDQU g, (6*32)(R); \
    VMOVDQU h, (7*32)(R)

#define loadState \
    VMOVDQU (0*32)(BX), a; \
    VMOVDQU (1*32)(BX), b; \
    VMOVDQU (2*32)(BX), c; \
    VMOVDQU (3*32)(BX), d; \
    VMOVDQU (4*32)(BX), e; \
    VMOVDQU (5*32)(BX), f; \
    VMOVDQU (6*32)(BX), g; \
    VMOVDQU (7*32)(BX), h
#define loadState(R) \
    VMOVDQU (0*32)(R), a; \
    VMOVDQU (1*32)(R), b; \
    VMOVDQU (2*32)(R), c; \
    VMOVDQU (3*32)(R), d; \
    VMOVDQU (4*32)(R), e; \
    VMOVDQU (5*32)(R), f; \
    VMOVDQU (6*32)(R), g; \
    VMOVDQU (7*32)(R), h

// r <<< n
#define VPROLD(r, n) \

@@ -150,16 +153,49 @@ GLOBL r08_mask<>(SB), 8, $32

#define LOAD_T(index, T) \
    VPBROADCASTD (index*4)(AX), T
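A note on VPROLD: here it is a macro, not the AVX-512 instruction of the same name. AVX2 has no packed rotate, so the macro body (elided by the diff; the SSE and AVX variants below show the same shift-shift-or pattern) emulates a rotate left by n in every 32-bit lane. A scalar Go model of the semantics:

    // rotl32 is what VPROLD(r, n) computes in each 32-bit lane of r.
    func rotl32(x uint32, n uint) uint32 {
        return x<<n | x>>(32-n)
    }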

// DST = X XOR Y XOR Z
#define FF0(X, Y, Z, DST) \
    VPXOR X, Y, DST; \
    VPXOR Z, DST, DST

// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define FF1(X, Y, Z, TMP, DST) \
    VPOR X, Y, DST; \
    VPAND X, Y, TMP; \
    VPAND Z, DST, DST; \
    VPOR TMP, DST, DST

// DST = X XOR Y XOR Z
#define GG0(X, Y, Z, DST) \
    FF0(X, Y, Z, DST)

// DST = (Y XOR Z) AND X XOR Z
#define GG1(X, Y, Z, DST) \
    VPXOR Y, Z, DST; \
    VPAND X, DST, DST; \
    VPXOR Z, DST, DST
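These macros are SM3's boolean functions: FF0/GG0 serve rounds 0-15, FF1/GG1 rounds 16-63. A scalar Go reference matching the comments above (GG1 is the usual choose function, rewritten so it needs one AND instead of two):

    func ff0(x, y, z uint32) uint32 { return x ^ y ^ z } // also gg0
    func ff1(x, y, z uint32) uint32 { return (x & y) | (x & z) | (y & z) }
    func gg1(x, y, z uint32) uint32 { return ((y ^ z) & x) ^ z } // == (x&y) | (^x&z)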

#define SS1SS2(index, a, e, SS1, SS2) \
    VPROLD2(a, SS2, 12); \ // a <<< 12
    LOAD_T(index, SS1); \ // const
    VPADDD SS1, SS2, SS1; \
    VPADDD e, SS1, SS1; \
    VPROLD(SS1, 7); \ // SS1
    VPXOR SS1, SS2, SS2; \ // SS2

#define COPY_RESULT(b, d, f, h, TT1, TT2) \
    VPROLD(b, 9); \
    VMOVDQU TT1, h; \ // TT1
    VPROLD(f, 19); \
    VPROLD2(TT2, TT1, 9); \ // tt2 <<< 9
    VPXOR TT2, TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
    VPSHUFB r08_mask<>(SB), TT1, TT1; \ // ROTL(17, tt2)
    VPXOR TT1, TT2, d
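COPY_RESULT finishes a round: it rotates b by 9 and f by 19 in place, stores TT1 into h, and writes P0(TT2) into d, where P0 is SM3's permutation. The r08_mask byte shuffle adds a rotation by 8 on top of the value already rotated by 9, which yields the rotation by 17 without another shift pair. A scalar sketch of the tail, using math/bits:

    // p0 is SM3's P0 permutation; COPY_RESULT computes d = p0(tt2).
    func p0(x uint32) uint32 {
        return x ^ bits.RotateLeft32(x, 9) ^ bits.RotateLeft32(x, 17)
    }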

#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
    VPROLD2(a, Y13, 12); \ // a <<< 12
    LOAD_T(index, Y12); \
    VPADDD Y12, Y13, Y12; \
    VPADDD e, Y12, Y12; \
    VPROLD(Y12, 7); \ // SS1
    VPXOR Y12, Y13, Y13; \ // SS2
    SS1SS2(index, a, e, Y12, Y13); \
    ; \
    VPXOR a, b, Y14; \
    VPXOR c, Y14, Y14; \ // (a XOR b XOR c)
    FF0(a, b, c, Y14); \
    VPADDD d, Y14, Y14; \ // (a XOR b XOR c) + d
    loadWord(Y10, index); \
    loadWord(Y11, index+4); \

@@ -168,17 +204,10 @@ GLOBL r08_mask<>(SB), 8, $32

    VPADDD Y14, Y13, Y13; \ // TT1
    VPADDD h, Y10, Y10; \ // Wt + h
    VPADDD Y12, Y10, Y10; \ // Wt + h + SS1
    VPXOR e, f, Y11; \
    VPXOR g, Y11, Y11; \ // (e XOR f XOR g)
    GG0(e, f, g, Y11); \
    VPADDD Y11, Y10, Y10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
    ; \ // copy result
    VPROLD(b, 9); \
    VMOVDQU Y13, h; \
    VPROLD(f, 19); \
    VPROLD2(Y10, Y13, 9); \ // tt2 <<< 9
    VPSHUFB r08_mask<>(SB), Y13, Y11; \ // ROTL(17, tt2)
    VPXOR Y10, Y13, Y13; \ // tt2 XOR ROTL(9, tt2)
    VPXOR Y11, Y13, d
    COPY_RESULT(b, d, f, h, Y13, Y10)
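Taken together, SS1SS2, FF0/GG0 and COPY_RESULT compute the standard SM3 round, identically in each of the eight lanes. A hedged scalar sketch (reusing p0 and math/bits from the sketches above; T is the round-constant table the code reaches through AX; all names are illustrative):

    // One SM3 round for j in 0..15; v = [a b c d e f g h].
    func round0015(v *[8]uint32, w *[68]uint32, T *[64]uint32, j int) {
        a, b, c, d := v[0], v[1], v[2], v[3]
        e, f, g, h := v[4], v[5], v[6], v[7]
        a12 := bits.RotateLeft32(a, 12)
        ss1 := bits.RotateLeft32(a12+e+T[j], 7) // SS1SS2 macro
        ss2 := ss1 ^ a12
        tt1 := (a ^ b ^ c) + d + ss2 + (w[j] ^ w[j+4]) // FF0; W'_j = Wj ^ Wj+4
        tt2 := (e ^ f ^ g) + h + ss1 + w[j]            // GG0
        v[0], v[1], v[2], v[3] = tt1, a, bits.RotateLeft32(b, 9), c
        v[4], v[5], v[6], v[7] = p0(tt2), e, bits.RotateLeft32(f, 19), g
    }

The assembly avoids the final register shuffle by rotating the macro's register arguments one position each round instead, as the ROUND_00_11(0, a, ...), ROUND_00_11(1, h, a, ...) call sequence further down shows.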

#define MESSAGE_SCHEDULE(index) \
    loadWord(Y10, index+1); \ // Wj-3

@@ -202,17 +231,9 @@ GLOBL r08_mask<>(SB), 8, $32

#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
    MESSAGE_SCHEDULE(index); \ // Y11 is Wt+4 now, please do not use it
    VPROLD2(a, Y13, 12); \ // a <<< 12
    LOAD_T(index, Y12); \
    VPADDD Y12, Y13, Y12; \
    VPADDD e, Y12, Y12; \
    VPROLD(Y12, 7); \ // SS1
    VPXOR Y12, Y13, Y13; \ // SS2
    SS1SS2(index, a, e, Y12, Y13); \
    ; \
    VPOR a, b, Y14; \
    VPAND a, b, Y10; \
    VPAND c, Y14, Y14; \
    VPOR Y10, Y14, Y14; \ // (a AND b) OR (a AND c) OR (b AND c)
    FF1(a, b, c, Y10, Y14); \ // (a AND b) OR (a AND c) OR (b AND c)
    VPADDD d, Y14, Y14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
    loadWord(Y10, index); \
    VPXOR Y10, Y11, Y11; \ // Wt XOR Wt+4

@@ -221,18 +242,10 @@ GLOBL r08_mask<>(SB), 8, $32

    ; \
    VPADDD h, Y10, Y10; \ // Wt + h
    VPADDD Y12, Y10, Y10; \ // Wt + h + SS1
    VPXOR f, g, Y11; \
    VPAND e, Y11, Y11; \
    VPXOR g, Y11, Y11; \ // (f XOR g) AND e XOR g
    GG1(e, f, g, Y11); \
    VPADDD Y11, Y10, Y10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
    ; \ // copy result
    VPROLD(b, 9); \
    VMOVDQU Y13, h; \
    VPROLD(f, 19); \
    VPROLD2(Y10, Y13, 9); \ // tt2 <<< 9
    VPSHUFB r08_mask<>(SB), Y13, Y11; \ // ROTL(17, tt2)
    VPXOR Y10, Y13, Y13; \ // tt2 XOR ROTL(9, tt2)
    VPXOR Y11, Y13, d
    COPY_RESULT(b, d, f, h, Y13, Y10)
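MESSAGE_SCHEDULE produces Wj+4 for the upcoming rounds (the Wj-3 load above feeds SM3's expansion recurrence). In scalar form, the schedule is:

    // p1 is SM3's P1 permutation, used only in message expansion.
    func p1(x uint32) uint32 {
        return x ^ bits.RotateLeft32(x, 15) ^ bits.RotateLeft32(x, 23)
    }

    // expand fills W[16..67] per GB/T 32905-2016.
    func expand(w *[68]uint32) {
        for j := 16; j < 68; j++ {
            w[j] = p1(w[j-16]^w[j-9]^bits.RotateLeft32(w[j-3], 15)) ^
                bits.RotateLeft32(w[j-13], 7) ^ w[j-6]
        }
    }

ROUND_16_63 then differs from ROUND_00_11 only in using FF1/GG1 in place of FF0/GG0.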

// transposeMatrix8x8(dig **[8]uint32)
TEXT ·transposeMatrix8x8(SB),NOSPLIT,$0

@@ -307,7 +320,7 @@ TEXT ·blockMultBy8(SB),NOSPLIT,$0

    TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4)

    saveState
    saveState(BX)

    MOVQ $·_K+0(SB), AX
    MOVQ (0*8)(SI), srcPtr1

@@ -324,7 +337,7 @@ loop:

    prepare8Words(1)

    // Need to load state again because YMM registers are used in prepare8Words
    loadState
    loadState(BX)

    ROUND_00_11(0, a, b, c, d, e, f, g, h)
    ROUND_00_11(1, h, a, b, c, d, e, f, g)

@@ -405,7 +418,7 @@ loop:

    DECQ DX
    JZ end

    saveState
    saveState(BX)
    LEAQ 64(srcPtr1), srcPtr1
    LEAQ 64(srcPtr2), srcPtr2
    LEAQ 64(srcPtr3), srcPtr3

@@ -446,33 +459,9 @@ TEXT ·copyResultsBy8(SB),NOSPLIT,$0

    MOVQ dig+0(FP), DI
    MOVQ dst+8(FP), SI

    // load state
    VMOVDQU (0*32)(DI), a
    VMOVDQU (1*32)(DI), b
    VMOVDQU (2*32)(DI), c
    VMOVDQU (3*32)(DI), d
    VMOVDQU (4*32)(DI), e
    VMOVDQU (5*32)(DI), f
    VMOVDQU (6*32)(DI), g
    VMOVDQU (7*32)(DI), h

    VPSHUFB flip_mask<>(SB), a, a
    VPSHUFB flip_mask<>(SB), b, b
    VPSHUFB flip_mask<>(SB), c, c
    VPSHUFB flip_mask<>(SB), d, d
    VPSHUFB flip_mask<>(SB), e, e
    VPSHUFB flip_mask<>(SB), f, f
    VPSHUFB flip_mask<>(SB), g, g
    VPSHUFB flip_mask<>(SB), h, h

    VMOVDQU a, (0*32)(SI)
    VMOVDQU b, (1*32)(SI)
    VMOVDQU c, (2*32)(SI)
    VMOVDQU d, (3*32)(SI)
    VMOVDQU e, (4*32)(SI)
    VMOVDQU f, (5*32)(SI)
    VMOVDQU g, (6*32)(SI)
    VMOVDQU h, (7*32)(SI)
    loadState(DI)
    REV32(a, b, c, d, e, f, g, h)
    saveState(SI)

    VZEROUPPER
    RET
@@ -69,19 +69,25 @@ GLOBL r08_mask<>(SB), 8, $16

#define tmp1 X8
#define tmp2 X9

#define storeState \
    MOVOU a, (BX) \
    MOVOU b, 16(BX) \
    MOVOU c, 32(BX) \
    MOVOU d, 48(BX) \
    MOVOU e, 64(BX) \
    MOVOU f, 80(BX) \
    MOVOU g, 96(BX) \
    MOVOU h, 112(BX)
#define storeState(R) \
    MOVOU a, (R) \
    MOVOU b, 16(R) \
    MOVOU c, 32(R) \
    MOVOU d, 48(R) \
    MOVOU e, 64(R) \
    MOVOU f, 80(R) \
    MOVOU g, 96(R) \
    MOVOU h, 112(R)

#define storeWord(W, j) MOVOU W, (128+(j)*16)(BX)
#define loadWord(W, i) MOVOU (128+(i)*16)(BX), W

#define SSE_REV32(a, b, c, d) \
    PSHUFB flip_mask<>(SB), a; \
    PSHUFB flip_mask<>(SB), b; \
    PSHUFB flip_mask<>(SB), c; \
    PSHUFB flip_mask<>(SB), d

#define prepare4Words(i) \
    MOVOU (i*16)(R8), X10; \
    MOVOU (i*16)(R9), X11; \

@@ -89,11 +95,7 @@ GLOBL r08_mask<>(SB), 8, $16

    MOVOU (i*16)(R11), X13; \
    ; \
    SSE_TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
    MOVOU flip_mask<>(SB), tmp1; \
    PSHUFB tmp1, X10; \
    PSHUFB tmp1, X11; \
    PSHUFB tmp1, X12; \
    PSHUFB tmp1, X13; \
    SSE_REV32(X10, X11, X12, X13); \
    ; \
    storeWord(X10, 4*i+0); \
    storeWord(X11, 4*i+1); \
@@ -111,18 +113,53 @@ GLOBL r08_mask<>(SB), 8, $16

    PSRLL $(32-n), tmp1; \
    POR tmp1, r

#define SSE_SS1SS2(index, a, e, TMP, SS1, SS2) \
    MOVOU a, SS1; \
    PROLD(SS1, 12); \
    MOVOU SS1, SS2; \ // a <<< 12
    LOAD_T(index, TMP); \
    PADDL TMP, SS1; \
    PADDL e, SS1; \
    PROLD(SS1, 7); \ // SS1
    PXOR SS1, SS2; \ // SS2

#define SSE_FF0(X, Y, Z, DST) \
    MOVOU X, DST; \
    PXOR Y, DST; \
    PXOR Z, DST

#define SSE_FF1(X, Y, Z, TMP, DST) \
    MOVOU X, DST; \
    POR Y, DST; \
    MOVOU X, TMP; \
    PAND Y, TMP; \
    PAND Z, DST; \
    POR TMP, DST; \ // (a AND b) OR (a AND c) OR (b AND c)

#define SSE_GG0(X, Y, Z, DST) \
    SSE_FF0(X, Y, Z, DST)

// DST = (Y XOR Z) AND X XOR Z
#define SSE_GG1(X, Y, Z, DST) \
    MOVOU Y, DST; \
    PXOR Z, DST; \
    PAND X, DST; \
    PXOR Z, DST

#define SSE_COPY_RESULT(b, d, f, h, TT1, TT2) \
    PROLD(b, 9); \
    MOVOU TT1, h; \
    PROLD(f, 19); \
    MOVOU TT2, TT1; \
    PROLD(TT1, 9); \
    PXOR TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
    PSHUFB r08_mask<>(SB), TT1; \ // ROTL(17, tt2)
    PXOR TT2, TT1; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
    MOVOU TT1, d

#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
    MOVOU a, X12; \
    PROLD(X12, 12); \
    MOVOU X12, X13; \ // a <<< 12
    LOAD_T(index, tmp2); \
    PADDL tmp2, X12; \
    PADDL e, X12; \
    PROLD(X12, 7); \ // SS1
    PXOR X12, X13; \ // SS2
    MOVOU b, X14; \
    PXOR a, X14; \
    PXOR c, X14; \ // (a XOR b XOR c)
    SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
    SSE_FF0(a, b, c, X14); \
    PADDL d, X14; \ // (a XOR b XOR c) + d
    loadWord(X10, index); \
    loadWord(X11, index+4); \

@@ -131,20 +168,10 @@ GLOBL r08_mask<>(SB), 8, $16

    PADDL X14, X13; \ // TT1
    PADDL h, X10; \ // Wt + h
    PADDL X12, X10; \ // Wt + h + SS1
    MOVOU e, X11; \
    PXOR f, X11; \
    PXOR g, X11; \ // (e XOR f XOR g)
    SSE_GG0(e, f, g, X11); \
    PADDL X11, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
    ; \ // copy result
    PROLD(b, 9); \
    MOVOU X13, h; \
    PROLD(f, 19); \
    MOVOU X10, X13; \
    PROLD(X13, 9); \
    PXOR X13, X10; \ // tt2 XOR ROTL(9, tt2)
    PSHUFB r08_mask<>(SB), X13; \ // ROTL(17, tt2)
    PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
    MOVOU X13, d
    SSE_COPY_RESULT(b, d, f, h, X13, X10)

#define MESSAGE_SCHEDULE(index) \
    loadWord(X10, index+1); \ // Wj-3

@@ -171,21 +198,9 @@ GLOBL r08_mask<>(SB), 8, $16

#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
    MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, please do not use it
    MOVOU a, X12; \
    PROLD(X12, 12); \
    MOVOU X12, X13; \ // a <<< 12
    LOAD_T(index, tmp2); \
    PADDL tmp2, X12; \
    PADDL e, X12; \
    PROLD(X12, 7); \ // SS1
    PXOR X12, X13; \ // SS2
    SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
    ; \
    MOVOU a, X14; \
    POR b, X14; \
    MOVOU a, X10; \
    PAND b, X10; \
    PAND c, X14; \
    POR X10, X14; \ // (a AND b) OR (a AND c) OR (b AND c)
    SSE_FF1(a, b, c, X10, X14); \
    PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
    loadWord(X10, index); \
    PXOR X10, X11; \ // Wt XOR Wt+4

@@ -194,43 +209,48 @@ GLOBL r08_mask<>(SB), 8, $16

    ; \
    PADDL h, X10; \ // Wt + h
    PADDL X12, X10; \ // Wt + h + SS1
    MOVOU f, X11; \
    PXOR g, X11; \
    PAND e, X11; \
    PXOR g, X11; \ // (f XOR g) AND e XOR g
    SSE_GG1(e, f, g, X11); \
    PADDL X11, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
    ; \ // copy result
    PROLD(b, 9); \
    MOVOU X13, h; \
    PROLD(f, 19); \
    MOVOU X10, X13; \
    PROLD(X13, 9); \
    PXOR X13, X10; \ // tt2 XOR ROTL(9, tt2)
    PSHUFB r08_mask<>(SB), X13; \ // ROTL(17, tt2)
    PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
    MOVOU X13, d
    SSE_COPY_RESULT(b, d, f, h, X13, X10)

// transpose matrix function, AVX/AVX2 version
// transpose matrix function, AVX version
// parameters:
// - r0: 128/256 bits register as input/output data
// - r1: 128/256 bits register as input/output data
// - r2: 128/256 bits register as input/output data
// - r3: 128/256 bits register as input/output data
// - tmp1: 128/256 bits temp register
// - tmp2: 128/256 bits temp register
// - r0: 128 bits register as input/output data
// - r1: 128 bits register as input/output data
// - r2: 128 bits register as input/output data
// - r3: 128 bits register as input/output data
// - tmp1: 128 bits temp register
// - tmp2: 128 bits temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
    VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
    VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
    VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
    VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
    VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
    VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
    VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
    VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
    VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w07, w03, w06, w02]
    VPUNPCKLDQ r1, r0, r0; \ // r0 = [w05, w01, w04, w00]
    VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w13, w09, w12, w08]
    VPUNPCKHDQ r3, r2, r2; \ // r2 = [w15, w11, w14, w10]
    VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w13, w09, w05, w01]
    VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w12, w08, w04, w00]
    VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w15, w11, w07, w03]
    VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w14, w10, w06, w02]
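The unpack-low/unpack-high pairs above are the classic 4x4 dword transpose: treating the four XMM registers as rows of a 4x4 matrix of 32-bit words, the macro leaves the columns in r0..r3 (the bracketed comments list lanes from high to low). The net effect, in scalar Go form:

    // transpose4x4 models TRANSPOSE_MATRIX: rows in, columns out.
    func transpose4x4(m *[4][4]uint32) {
        for i := 0; i < 4; i++ {
            for j := i + 1; j < 4; j++ {
                m[i][j], m[j][i] = m[j][i], m[i][j]
            }
        }
    }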

#define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX)
#define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W

#define avxStoreState(R) \
    VMOVDQU a, (0*16)(R) \
    VMOVDQU b, (1*16)(R) \
    VMOVDQU c, (2*16)(R) \
    VMOVDQU d, (3*16)(R) \
    VMOVDQU e, (4*16)(R) \
    VMOVDQU f, (5*16)(R) \
    VMOVDQU g, (6*16)(R) \
    VMOVDQU h, (7*16)(R)

#define AVX_REV32(a, b, c, d) \
    VPSHUFB flip_mask<>(SB), a, a; \
    VPSHUFB flip_mask<>(SB), b, b; \
    VPSHUFB flip_mask<>(SB), c, c; \
    VPSHUFB flip_mask<>(SB), d, d

#define avxPrepare4Words(i) \
    VMOVDQU (i*16)(R8), X10; \
    VMOVDQU (i*16)(R9), X11; \

@@ -238,10 +258,7 @@ GLOBL r08_mask<>(SB), 8, $16

    VMOVDQU (i*16)(R11), X13; \
    ; \
    TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
    VPSHUFB flip_mask<>(SB), X10, X10; \
    VPSHUFB flip_mask<>(SB), X11, X11; \
    VPSHUFB flip_mask<>(SB), X12, X12; \
    VPSHUFB flip_mask<>(SB), X13, X13; \
    AVX_REV32(X10, X11, X12, X13); \
    ; \
    avxStoreWord(X10, 4*i+0); \
    avxStoreWord(X11, 4*i+1); \

@@ -264,16 +281,49 @@ GLOBL r08_mask<>(SB), 8, $16

    VPSRLD $(32-n), r, d; \
    VPOR tmp1, d, d

#define AVX_SS1SS2(index, a, e, SS1, SS2) \
    VPROLD2(a, SS2, 12); \ // a <<< 12
    AVX_LOAD_T(index, SS1); \
    VPADDD SS1, SS2, SS1; \
    VPADDD e, SS1, SS1; \
    VPROLD(SS1, 7); \ // SS1
    VPXOR SS1, SS2, SS2

// DST = X XOR Y XOR Z
#define AVX_FF0(X, Y, Z, DST) \
    VPXOR X, Y, DST; \
    VPXOR Z, DST, DST

// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define AVX_FF1(X, Y, Z, TMP, DST) \
    VPOR X, Y, DST; \
    VPAND X, Y, TMP; \
    VPAND Z, DST, DST; \
    VPOR TMP, DST, DST

// DST = X XOR Y XOR Z
#define AVX_GG0(X, Y, Z, DST) \
    AVX_FF0(X, Y, Z, DST)

// DST = (Y XOR Z) AND X XOR Z
#define AVX_GG1(X, Y, Z, DST) \
    VPXOR Y, Z, DST; \
    VPAND X, DST, DST; \
    VPXOR Z, DST, DST

#define AVX_COPY_RESULT(b, d, f, h, TT1, TT2) \
    VPROLD(b, 9); \
    VMOVDQU TT1, h; \
    VPROLD(f, 19); \
    VPROLD2(TT2, TT1, 9); \ // tt2 <<< 9
    VPXOR TT2, TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
    VPSHUFB r08_mask<>(SB), TT1, TT1; \ // ROTL(17, tt2)
    VPXOR TT2, TT1, d

#define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \
    VPROLD2(a, X13, 12); \ // a <<< 12
    AVX_LOAD_T(index, X12); \
    VPADDD X12, X13, X12; \
    VPADDD e, X12, X12; \
    VPROLD(X12, 7); \ // SS1
    VPXOR X12, X13, X13; \ // SS2
    AVX_SS1SS2(index, a, e, X12, X13); \
    ; \
    VPXOR a, b, X14; \
    VPXOR c, X14, X14; \ // (a XOR b XOR c)
    AVX_FF0(a, b, c, X14); \
    VPADDD d, X14, X14; \ // (a XOR b XOR c) + d
    avxLoadWord(X10, index); \
    avxLoadWord(X11, index+4); \

@@ -282,17 +332,10 @@ GLOBL r08_mask<>(SB), 8, $16

    VPADDD X14, X13, X13; \ // TT1
    VPADDD h, X10, X10; \ // Wt + h
    VPADDD X12, X10, X10; \ // Wt + h + SS1
    VPXOR e, f, X11; \
    VPXOR g, X11, X11; \ // (e XOR f XOR g)
    AVX_GG0(e, f, g, X11); \
    VPADDD X11, X10, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
    ; \ // copy result
    VPROLD(b, 9); \
    VMOVDQU X13, h; \
    VPROLD(f, 19); \
    VPROLD2(X10, X13, 9); \ // tt2 <<< 9
    VPXOR X10, X13, X10; \ // tt2 XOR ROTL(9, tt2)
    VPSHUFB r08_mask<>(SB), X13, X13; \ // ROTL(17, tt2)
    VPXOR X10, X13, d
    AVX_COPY_RESULT(b, d, f, h, X13, X10)

#define AVX_MESSAGE_SCHEDULE(index) \
    avxLoadWord(X10, index+1); \ // Wj-3

@@ -316,17 +359,9 @@ GLOBL r08_mask<>(SB), 8, $16

#define AVX_ROUND_16_63(index, a, b, c, d, e, f, g, h) \
    AVX_MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, please do not use it
    VPROLD2(a, X13, 12); \ // a <<< 12
    AVX_LOAD_T(index, X12); \
    VPADDD X12, X13, X12; \
    VPADDD e, X12, X12; \
    VPROLD(X12, 7); \ // SS1
    VPXOR X12, X13, X13; \ // SS2
    AVX_SS1SS2(index, a, e, X12, X13); \
    ; \
    VPOR a, b, X14; \
    VPAND a, b, X10; \
    VPAND c, X14, X14; \
    VPOR X10, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c)
    AVX_FF1(a, b, c, X10, X14); \
    VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
    avxLoadWord(X10, index); \
    VPXOR X10, X11, X11; \ // Wt XOR Wt+4

@@ -335,18 +370,10 @@ GLOBL r08_mask<>(SB), 8, $16

    ; \
    VPADDD h, X10, X10; \ // Wt + h
    VPADDD X12, X10, X10; \ // Wt + h + SS1
    VPXOR f, g, X11; \
    VPAND e, X11, X11; \
    VPXOR g, X11, X11; \ // (f XOR g) AND e XOR g
    AVX_GG1(e, f, g, X11); \
    VPADDD X11, X10, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
    ; \ // copy result
    VPROLD(b, 9); \
    VMOVDQU X13, h; \
    VPROLD(f, 19); \
    VPROLD2(X10, X13, 9); \ // tt2 <<< 9
    VPXOR X10, X13, X10; \ // tt2 XOR ROTL(9, tt2)
    VPSHUFB r08_mask<>(SB), X13, X13; \ // ROTL(17, tt2)
    VPXOR X10, X13, d
    AVX_COPY_RESULT(b, d, f, h, X13, X10)

// blockMultBy4(dig **[8]uint32, p *[]byte, buffer *byte, blocks int)
TEXT ·blockMultBy4(SB),NOSPLIT,$0
@@ -377,7 +404,7 @@ TEXT ·blockMultBy4(SB),NOSPLIT,$0

    SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

    // store state to temporary buffer
    storeState
    storeState(BX)

    MOVQ $·_K+0(SB), AX
    MOVQ (SI), R8

@@ -479,7 +506,7 @@ loop:

    DECQ DX
    JZ end

    storeState
    storeState(BX)
    LEAQ 64(R8), R8
    LEAQ 64(R9), R9
    LEAQ 64(R10), R10

@@ -525,14 +552,7 @@ avx:

    TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
    TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

    VMOVDQU a, (BX)
    VMOVDQU b, 16(BX)
    VMOVDQU c, 32(BX)
    VMOVDQU d, 48(BX)
    VMOVDQU e, 64(BX)
    VMOVDQU f, 80(BX)
    VMOVDQU g, 96(BX)
    VMOVDQU h, 112(BX)
    avxStoreState(BX)

    MOVQ $·_K+0(SB), AX
    MOVQ (SI), R8

@@ -627,14 +647,7 @@ avxLoop:

    JZ avxEnd

    // store current state
    VMOVDQU a, (0*16)(BX)
    VMOVDQU b, (1*16)(BX)
    VMOVDQU c, (2*16)(BX)
    VMOVDQU d, (3*16)(BX)
    VMOVDQU e, (4*16)(BX)
    VMOVDQU f, (5*16)(BX)
    VMOVDQU g, (6*16)(BX)
    VMOVDQU h, (7*16)(BX)
    avxStoreState(BX)

    LEAQ 64(R8), R8
    LEAQ 64(R9), R9

@@ -680,23 +693,9 @@ TEXT ·copyResultsBy4(SB),NOSPLIT,$0

    MOVOU (6*16)(DI), g
    MOVOU (7*16)(DI), h

    MOVOU flip_mask<>(SB), tmp1
    PSHUFB tmp1, a
    PSHUFB tmp1, b
    PSHUFB tmp1, c
    PSHUFB tmp1, d
    PSHUFB tmp1, e
    PSHUFB tmp1, f
    PSHUFB tmp1, g
    PSHUFB tmp1, h
    MOVOU a, (0*16)(SI)
    MOVOU b, (1*16)(SI)
    MOVOU c, (2*16)(SI)
    MOVOU d, (3*16)(SI)
    MOVOU e, (4*16)(SI)
    MOVOU f, (5*16)(SI)
    MOVOU g, (6*16)(SI)
    MOVOU h, (7*16)(SI)
    SSE_REV32(a, b, c, d)
    SSE_REV32(e, f, g, h)
    storeState(SI)

    RET

@@ -711,22 +710,9 @@ avx:

    VMOVDQU (6*16)(DI), g
    VMOVDQU (7*16)(DI), h

    VPSHUFB flip_mask<>(SB), a, a
    VPSHUFB flip_mask<>(SB), b, b
    VPSHUFB flip_mask<>(SB), c, c
    VPSHUFB flip_mask<>(SB), d, d
    VPSHUFB flip_mask<>(SB), e, e
    VPSHUFB flip_mask<>(SB), f, f
    VPSHUFB flip_mask<>(SB), g, g
    VPSHUFB flip_mask<>(SB), h, h
    AVX_REV32(a, b, c, d)
    AVX_REV32(e, f, g, h)

    VMOVDQU a, (0*16)(SI)
    VMOVDQU b, (1*16)(SI)
    VMOVDQU c, (2*16)(SI)
    VMOVDQU d, (3*16)(SI)
    VMOVDQU e, (4*16)(SI)
    VMOVDQU f, (5*16)(SI)
    VMOVDQU g, (6*16)(SI)
    VMOVDQU h, (7*16)(SI)
    avxStoreState(SI)

    RET