sm3: multi-block refactoring

Sun Yimin 2024-05-28 17:44:12 +08:00 committed by GitHub
parent 2c688bb9d3
commit 1cf81a8e7c
2 changed files with 239 additions and 264 deletions

Changed file 1 of 2

@@ -86,6 +86,16 @@ GLOBL r08_mask<>(SB), 8, $32
// load 256 bits
#define loadWord(W, i) VMOVDQU (256+(i)*32)(BX), W
#define REV32(a, b, c, d, e, f, g, h) \
VPSHUFB flip_mask<>(SB), a, a; \
VPSHUFB flip_mask<>(SB), b, b; \
VPSHUFB flip_mask<>(SB), c, c; \
VPSHUFB flip_mask<>(SB), d, d; \
VPSHUFB flip_mask<>(SB), e, e; \
VPSHUFB flip_mask<>(SB), f, f; \
VPSHUFB flip_mask<>(SB), g, g; \
VPSHUFB flip_mask<>(SB), h, h
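For reference, a scalar Go sketch of what REV32 does to each 32-bit lane, assuming flip_mask encodes a per-word byte reversal (the usual way SM3's big-endian message words are produced from little-endian loads):

package sketch

import (
	"encoding/binary"
	"math/bits"
)

// rev32 models one lane of REV32: VPSHUFB with a byte-reversal mask
// swaps the byte order inside every 32-bit word.
func rev32(x uint32) uint32 { return bits.ReverseBytes32(x) }

// bigEndianWord shows the intended effect on a loaded message word:
// rev32 of the little-endian load equals the big-endian read.
func bigEndianWord(p []byte) uint32 {
	return rev32(binary.LittleEndian.Uint32(p)) // == binary.BigEndian.Uint32(p)
}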
#define prepare8Words(i) \
VMOVDQU (i*32)(srcPtr1), a; \
VMOVDQU (i*32)(srcPtr2), b; \
@@ -97,14 +107,7 @@ GLOBL r08_mask<>(SB), 8, $32
VMOVDQU (i*32)(srcPtr8), h; \
; \
TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4); \
VPSHUFB flip_mask<>(SB), a, a; \
VPSHUFB flip_mask<>(SB), b, b; \
VPSHUFB flip_mask<>(SB), c, c; \
VPSHUFB flip_mask<>(SB), d, d; \
VPSHUFB flip_mask<>(SB), e, e; \
VPSHUFB flip_mask<>(SB), f, f; \
VPSHUFB flip_mask<>(SB), g, g; \
VPSHUFB flip_mask<>(SB), h, h; \
REV32(a, b, c, d, e, f, g, h); \
; \
storeWord(a, 8*i+0); \
storeWord(b, 8*i+1); \
@@ -115,25 +118,25 @@ GLOBL r08_mask<>(SB), 8, $32
storeWord(g, 8*i+6); \
storeWord(h, 8*i+7)
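prepare8Words(i) gathers 32 bytes from each of the eight block pointers, transposes the resulting 8x8 word matrix, byte-swaps it, and stores eight interleaved message-word vectors. A minimal scalar Go model of the layout this leaves in the word buffer (a hypothetical helper; "lane" means which of the eight input blocks):

package sketch

import "encoding/binary"

// prepareWords models prepare8Words(0) and prepare8Words(1) for one
// 64-byte block per lane: words[j][lane] is big-endian message word j
// of block `lane`, matching what storeWord(_, 8*i+j) lays out.
func prepareWords(blocks *[8][64]byte) (words [16][8]uint32) {
	for j := 0; j < 16; j++ {
		for lane := 0; lane < 8; lane++ {
			words[j][lane] = binary.BigEndian.Uint32(blocks[lane][4*j : 4*j+4])
		}
	}
	return
}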
#define saveState \
VMOVDQU a, (0*32)(BX); \
VMOVDQU b, (1*32)(BX); \
VMOVDQU c, (2*32)(BX); \
VMOVDQU d, (3*32)(BX); \
VMOVDQU e, (4*32)(BX); \
VMOVDQU f, (5*32)(BX); \
VMOVDQU g, (6*32)(BX); \
VMOVDQU h, (7*32)(BX)
#define saveState(R) \
VMOVDQU a, (0*32)(R); \
VMOVDQU b, (1*32)(R); \
VMOVDQU c, (2*32)(R); \
VMOVDQU d, (3*32)(R); \
VMOVDQU e, (4*32)(R); \
VMOVDQU f, (5*32)(R); \
VMOVDQU g, (6*32)(R); \
VMOVDQU h, (7*32)(R)
#define loadState \
VMOVDQU (0*32)(BX), a; \
VMOVDQU (1*32)(BX), b; \
VMOVDQU (2*32)(BX), c; \
VMOVDQU (3*32)(BX), d; \
VMOVDQU (4*32)(BX), e; \
VMOVDQU (5*32)(BX), f; \
VMOVDQU (6*32)(BX), g; \
VMOVDQU (7*32)(BX), h
#define loadState(R) \
VMOVDQU (0*32)(R), a; \
VMOVDQU (1*32)(R), b; \
VMOVDQU (2*32)(R), c; \
VMOVDQU (3*32)(R), d; \
VMOVDQU (4*32)(R), e; \
VMOVDQU (5*32)(R), f; \
VMOVDQU (6*32)(R), g; \
VMOVDQU (7*32)(R), h
// r <<< n
#define VPROLD(r, n) \
@@ -150,16 +153,49 @@ GLOBL r08_mask<>(SB), 8, $32
#define LOAD_T(index, T) \
VPBROADCASTD (index*4)(AX), T
// DST = X XOR Y XOR Z
#define FF0(X, Y, Z, DST) \
VPXOR X, Y, DST; \
VPXOR Z, DST, DST
// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define FF1(X, Y, Z, TMP, DST) \
VPOR X, Y, DST; \
VPAND X, Y, TMP; \
VPAND Z, DST, DST; \
VPOR TMP, DST, DST
// DST = X XOR Y XOR Z
#define GG0(X, Y, Z, DST) \
FF0(X, Y, Z, DST)
// DST = ((Y XOR Z) AND X) XOR Z
#define GG1(X, Y, Z, DST) \
VPXOR Y, Z, DST; \
VPAND X, DST, DST; \
VPXOR Z, DST, DST
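These macros are the SM3 boolean functions. A scalar Go sketch, including the identity that lets GG1 get by with one AND and two XORs instead of the textbook AND-NOT form:

package sketch

// ff0/gg0 serve rounds 0..15, ff1/gg1 serve rounds 16..63.
func ff0(x, y, z uint32) uint32 { return x ^ y ^ z }
func ff1(x, y, z uint32) uint32 { return (x & y) | (x & z) | (y & z) }
func gg0(x, y, z uint32) uint32 { return x ^ y ^ z }

// gg1 computes ((y^z)&x)^z, which equals the specification's
// (x&y)|(^x&z): x selects y, ^x selects z.
func gg1(x, y, z uint32) uint32 { return ((y ^ z) & x) ^ z }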
#define SS1SS2(index, a, e, SS1, SS2) \
VPROLD2(a, SS2, 12); \ // a <<< 12
LOAD_T(index, SS1); \ // const
VPADDD SS1, SS2, SS1; \
VPADDD e, SS1, SS1; \
VPROLD(SS1, 7); \ // SS1
VPXOR SS1, SS2, SS2; \ // SS2
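SS1SS2 factors out the two per-round scalars. A scalar Go sketch, assuming the _K table that LOAD_T indexes holds the round constants already rotated, i.e. _K[j] = T(j) <<< (j mod 32):

package sketch

import "math/bits"

// ss1ss2 models one lane of SS1SS2; k is the pre-rotated constant.
func ss1ss2(a, e, k uint32) (ss1, ss2 uint32) {
	a12 := bits.RotateLeft32(a, 12)     // VPROLD2(a, SS2, 12)
	ss1 = bits.RotateLeft32(a12+e+k, 7) // VPADDD, VPADDD, VPROLD(SS1, 7)
	ss2 = ss1 ^ a12                     // VPXOR SS1, SS2, SS2
	return
}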
#define COPY_RESULT(b, d, f, h, TT1, TT2) \
VPROLD(b, 9); \
VMOVDQU TT1, h; \ // TT1
VPROLD(f, 19); \
VPROLD2(TT2, TT1, 9); \ // tt2 <<< 9
VPXOR TT2, TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
VPSHUFB r08_mask<>(SB), TT1, TT1; \ // ROTL(17, tt2)
VPXOR TT1, TT2, d
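COPY_RESULT is the tail of every round: it rotates b and f in place, moves TT1 into h, and writes P0(TT2) into d; the r08_mask shuffle supplies the extra 8-bit rotation that turns the <<<9 value into <<<17. A scalar Go sketch, assuming r08_mask rotates each 32-bit lane left by 8 bits:

package sketch

import "math/bits"

// p0 is the SM3 permutation P0(x) = x ^ (x <<< 9) ^ (x <<< 17).
func p0(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 9) ^ bits.RotateLeft32(x, 17)
}

// copyResult models COPY_RESULT for one lane; the caller keeps a, c,
// e and g unchanged and renames registers for the next round.
func copyResult(b, f, tt1, tt2 uint32) (nb, nd, nf, nh uint32) {
	nb = bits.RotateLeft32(b, 9)
	nf = bits.RotateLeft32(f, 19)
	nh = tt1
	nd = p0(tt2)
	return
}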
#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
VPROLD2(a, Y13, 12); \ // a <<< 12
LOAD_T(index, Y12); \
VPADDD Y12, Y13, Y12; \
VPADDD e, Y12, Y12; \
VPROLD(Y12, 7); \ // SS1
VPXOR Y12, Y13, Y13; \ // SS2
SS1SS2(index, a, e, Y12, Y13); \
; \
VPXOR a, b, Y14; \
VPXOR c, Y14, Y14; \ // (a XOR b XOR c)
FF0(a, b, c, Y14); \
VPADDD d, Y14, Y14; \ // (a XOR b XOR c) + d
loadWord(Y10, index); \
loadWord(Y11, index+4); \
@@ -168,17 +204,10 @@ GLOBL r08_mask<>(SB), 8, $32
VPADDD Y14, Y13, Y13; \ // TT1
VPADDD h, Y10, Y10; \ // Wt + h
VPADDD Y12, Y10, Y10; \ // Wt + h + SS1
VPXOR e, f, Y11; \
VPXOR g, Y11, Y11; \ // (e XOR f XOR g)
GG0(e, f, g, Y11); \
VPADDD Y11, Y10, Y10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
; \ // copy result
VPROLD(b, 9); \
VMOVDQU Y13, h; \
VPROLD(f, 19); \
VPROLD2(Y10, Y13, 9); \ // tt2 <<< 9
VPSHUFB r08_mask<>(SB), Y13, Y11; \ // ROTL(17, tt2)
VPXOR Y10, Y13, Y13; \ // tt2 XOR ROTL(9, tt2)
VPXOR Y11, Y13, d
COPY_RESULT(b, d, f, h, Y13, Y10)
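Taken together, ROUND_00_11 is the vectorized form of the following per-lane round for j in 0..15 (a reference model only; the assembly renames registers between macro invocations instead of shifting values, and k is assumed to be the pre-rotated constant _K[j]):

package sketch

import "math/bits"

// round0To15 models one lane of ROUND_00_11. w and w4 are W[j] and W[j+4].
func round0To15(a, b, c, d, e, f, g, h, w, w4, k uint32) (na, nb, nc, nd, ne, nf, ng, nh uint32) {
	a12 := bits.RotateLeft32(a, 12)
	ss1 := bits.RotateLeft32(a12+e+k, 7)
	ss2 := ss1 ^ a12
	tt1 := (a ^ b ^ c) + d + ss2 + (w ^ w4) // FF0(a,b,c) + d + SS2 + W'[j]
	tt2 := (e ^ f ^ g) + h + ss1 + w        // GG0(e,f,g) + h + SS1 + W[j]
	na, nb, nc, nd = tt1, a, bits.RotateLeft32(b, 9), c
	ne = tt2 ^ bits.RotateLeft32(tt2, 9) ^ bits.RotateLeft32(tt2, 17) // P0(tt2)
	nf, ng, nh = e, bits.RotateLeft32(f, 19), g
	return
}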
#define MESSAGE_SCHEDULE(index) \
loadWord(Y10, index+1); \ // Wj-3
@@ -202,17 +231,9 @@ GLOBL r08_mask<>(SB), 8, $32
#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index); \ // Y11 is Wt+4 now, please do not use it
VPROLD2(a, Y13, 12); \ // a <<< 12
LOAD_T(index, Y12); \
VPADDD Y12, Y13, Y12; \
VPADDD e, Y12, Y12; \
VPROLD(Y12, 7); \ // SS1
VPXOR Y12, Y13, Y13; \ // SS2
SS1SS2(index, a, e, Y12, Y13); \
; \
VPOR a, b, Y14; \
VPAND a, b, Y10; \
VPAND c, Y14, Y14; \
VPOR Y10, Y14, Y14; \ // (a AND b) OR (a AND c) OR (b AND c)
FF1(a, b, c, Y10, Y14); \ // (a AND b) OR (a AND c) OR (b AND c)
VPADDD d, Y14, Y14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
loadWord(Y10, index); \
VPXOR Y10, Y11, Y11; \ // Wt XOR Wt+4
@@ -221,18 +242,10 @@ GLOBL r08_mask<>(SB), 8, $32
; \
VPADDD h, Y10, Y10; \ // Wt + h
VPADDD Y12, Y10, Y10; \ // Wt + h + SS1
VPXOR f, g, Y11; \
VPAND e, Y11, Y11; \
VPXOR g, Y11, Y11; \ // (f XOR g) AND e XOR g
GG1(e, f, g, Y11); \
VPADDD Y11, Y10, Y10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
; \ // copy result
VPROLD(b, 9); \
VMOVDQU Y13, h; \
VPROLD(f, 19); \
VPROLD2(Y10, Y13, 9); \ // tt2 <<< 9
VPSHUFB r08_mask<>(SB), Y13, Y11; \ // ROTL(17, tt2)
VPXOR Y10, Y13, Y13; \ // tt2 XOR ROTL(9, tt2)
VPXOR Y11, Y13, d
COPY_RESULT(b, d, f, h, Y13, Y10)
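ROUND_16_63 differs from ROUND_00_11 in two ways: it switches to FF1/GG1, and it first runs MESSAGE_SCHEDULE to extend the word array. Judging by the Wj-3 load at index+1, the word being produced is W[index+4], which matches the SM3 expansion; a scalar Go sketch of that recurrence:

package sketch

import "math/bits"

// p1 is the SM3 permutation P1(x) = x ^ (x <<< 15) ^ (x <<< 23).
func p1(x uint32) uint32 {
	return x ^ bits.RotateLeft32(x, 15) ^ bits.RotateLeft32(x, 23)
}

// expandWord models MESSAGE_SCHEDULE(j): it produces W[j+4] so the
// round can use W'[j] = W[j] ^ W[j+4]. Valid while j+4 >= 16.
func expandWord(w []uint32, j int) uint32 {
	t := j + 4 // index of the word being generated
	return p1(w[t-16]^w[t-9]^bits.RotateLeft32(w[t-3], 15)) ^
		bits.RotateLeft32(w[t-13], 7) ^ w[t-6]
}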
// transposeMatrix8x8(dig **[8]uint32)
TEXT ·transposeMatrix8x8(SB),NOSPLIT,$0
@@ -307,7 +320,7 @@ TEXT ·blockMultBy8(SB),NOSPLIT,$0
TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4)
saveState
saveState(BX)
MOVQ $·_K+0(SB), AX
MOVQ (0*8)(SI), srcPtr1
@@ -324,7 +337,7 @@ loop:
prepare8Words(1)
// Need to load the state again because YMM registers are used in prepare8Words
loadState
loadState(BX)
ROUND_00_11(0, a, b, c, d, e, f, g, h)
ROUND_00_11(1, h, a, b, c, d, e, f, g)
@@ -405,7 +418,7 @@ loop:
DECQ DX
JZ end
saveState
saveState(BX)
LEAQ 64(srcPtr1), srcPtr1
LEAQ 64(srcPtr2), srcPtr2
LEAQ 64(srcPtr3), srcPtr3
@@ -446,33 +459,9 @@ TEXT ·copyResultsBy8(SB),NOSPLIT,$0
MOVQ dig+0(FP), DI
MOVQ dst+8(FP), SI
// load state
VMOVDQU (0*32)(DI), a
VMOVDQU (1*32)(DI), b
VMOVDQU (2*32)(DI), c
VMOVDQU (3*32)(DI), d
VMOVDQU (4*32)(DI), e
VMOVDQU (5*32)(DI), f
VMOVDQU (6*32)(DI), g
VMOVDQU (7*32)(DI), h
VPSHUFB flip_mask<>(SB), a, a
VPSHUFB flip_mask<>(SB), b, b
VPSHUFB flip_mask<>(SB), c, c
VPSHUFB flip_mask<>(SB), d, d
VPSHUFB flip_mask<>(SB), e, e
VPSHUFB flip_mask<>(SB), f, f
VPSHUFB flip_mask<>(SB), g, g
VPSHUFB flip_mask<>(SB), h, h
VMOVDQU a, (0*32)(SI)
VMOVDQU b, (1*32)(SI)
VMOVDQU c, (2*32)(SI)
VMOVDQU d, (3*32)(SI)
VMOVDQU e, (4*32)(SI)
VMOVDQU f, (5*32)(SI)
VMOVDQU g, (6*32)(SI)
VMOVDQU h, (7*32)(SI)
loadState(DI)
REV32(a, b, c, d, e, f, g, h)
saveState(SI)
VZEROUPPER
RET

Changed file 2 of 2

@@ -69,19 +69,25 @@ GLOBL r08_mask<>(SB), 8, $16
#define tmp1 X8
#define tmp2 X9
#define storeState \
MOVOU a, (BX) \
MOVOU b, 16(BX) \
MOVOU c, 32(BX) \
MOVOU d, 48(BX) \
MOVOU e, 64(BX) \
MOVOU f, 80(BX) \
MOVOU g, 96(BX) \
MOVOU h, 112(BX)
#define storeState(R) \
MOVOU a, (R) \
MOVOU b, 16(R) \
MOVOU c, 32(R) \
MOVOU d, 48(R) \
MOVOU e, 64(R) \
MOVOU f, 80(R) \
MOVOU g, 96(R) \
MOVOU h, 112(R)
#define storeWord(W, j) MOVOU W, (128+(j)*16)(BX)
#define loadWord(W, i) MOVOU (128+(i)*16)(BX), W
#define SSE_REV32(a, b, c, d) \
PSHUFB flip_mask<>(SB), a; \
PSHUFB flip_mask<>(SB), b; \
PSHUFB flip_mask<>(SB), c; \
PSHUFB flip_mask<>(SB), d
#define prepare4Words(i) \
MOVOU (i*16)(R8), X10; \
MOVOU (i*16)(R9), X11; \
@@ -89,11 +95,7 @@ GLOBL r08_mask<>(SB), 8, $16
MOVOU (i*16)(R11), X13; \
; \
SSE_TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
MOVOU flip_mask<>(SB), tmp1; \
PSHUFB tmp1, X10; \
PSHUFB tmp1, X11; \
PSHUFB tmp1, X12; \
PSHUFB tmp1, X13; \
SSE_REV32(X10, X11, X12, X13); \
; \
storeWord(X10, 4*i+0); \
storeWord(X11, 4*i+1); \
@@ -111,18 +113,53 @@ GLOBL r08_mask<>(SB), 8, $16
PSRLL $(32-n), tmp1; \
POR tmp1, r
#define SSE_SS1SS2(index, a, e, TMP, SS1, SS2) \
MOVOU a, SS1; \
PROLD(SS1, 12); \
MOVOU SS1, SS2; \ // a <<< 12
LOAD_T(index, TMP); \
PADDL TMP, SS1; \
PADDL e, SS1; \
PROLD(SS1, 7); \ // SS1
PXOR SS1, SS2; \ // SS2
#define SSE_FF0(X, Y, Z, DST) \
MOVOU X, DST; \
PXOR Y, DST; \
PXOR Z, DST
#define SSE_FF1(X, Y, Z, TMP, DST) \
MOVOU X, DST; \
POR Y, DST; \
MOVOU X, TMP; \
PAND Y, TMP; \
PAND Z, DST; \
POR TMP, DST; \ // (X AND Y) OR (X AND Z) OR (Y AND Z)
#define SSE_GG0(X, Y, Z, DST) \
SSE_FF0(X, Y, Z, DST)
// DST = ((Y XOR Z) AND X) XOR Z
#define SSE_GG1(X, Y, Z, DST) \
MOVOU Y, DST; \
PXOR Z, DST; \
PAND X, DST; \
PXOR Z, DST
#define SSE_COPY_RESULT(b, d, f, h, TT1, TT2) \
PROLD(b, 9); \
MOVOU TT1, h; \
PROLD(f, 19); \
MOVOU TT2, TT1; \
PROLD(TT1, 9); \
PXOR TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
PSHUFB r08_mask<>(SB), TT1; \ // ROTL(17, tt2)
PXOR TT2, TT1; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
MOVOU TT1, d
#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
MOVOU a, X12; \
PROLD(X12, 12); \
MOVOU X12, X13; \ // a <<< 12
LOAD_T(index, tmp2); \
PADDL tmp2, X12; \
PADDL e, X12; \
PROLD(X12, 7); \ // SS1
PXOR X12, X13; \ // SS2
MOVOU b, X14; \
PXOR a, X14; \
PXOR c, X14; \ // (a XOR b XOR c)
SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
SSE_FF0(a, b, c, X14); \
PADDL d, X14; \ // (a XOR b XOR c) + d
loadWord(X10, index); \
loadWord(X11, index+4); \
@@ -131,20 +168,10 @@ GLOBL r08_mask<>(SB), 8, $16
PADDL X14, X13; \ // TT1
PADDL h, X10; \ // Wt + h
PADDL X12, X10; \ // Wt + h + SS1
MOVOU e, X11; \
PXOR f, X11; \
PXOR g, X11; \ // (e XOR f XOR g)
SSE_GG0(e, f, g, X11); \
PADDL X11, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
; \ // copy result
PROLD(b, 9); \
MOVOU X13, h; \
PROLD(f, 19); \
MOVOU X10, X13; \
PROLD(X13, 9); \
PXOR X13, X10; \ // tt2 XOR ROTL(9, tt2)
PSHUFB r08_mask<>(SB), X13; \ // ROTL(17, tt2)
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
MOVOU X13, d
SSE_COPY_RESULT(b, d, f, h, X13, X10)
#define MESSAGE_SCHEDULE(index) \
loadWord(X10, index+1); \ // Wj-3
@@ -171,21 +198,9 @@ GLOBL r08_mask<>(SB), 8, $16
#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, please do not use it
MOVOU a, X12; \
PROLD(X12, 12); \
MOVOU X12, X13; \ // a <<< 12
LOAD_T(index, tmp2); \
PADDL tmp2, X12; \
PADDL e, X12; \
PROLD(X12, 7); \ // SS1
PXOR X12, X13; \ // SS2
SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
; \
MOVOU a, X14; \
POR b, X14; \
MOVOU a, X10; \
PAND b, X10; \
PAND c, X14; \
POR X10, X14; \ // (a AND b) OR (a AND c) OR (b AND c)
SSE_FF1(a, b, c, X10, X14); \
PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
loadWord(X10, index); \
PXOR X10, X11; \ // Wt XOR Wt+4
@@ -194,43 +209,48 @@ GLOBL r08_mask<>(SB), 8, $16
; \
PADDL h, X10; \ // Wt + h
PADDL X12, X10; \ // Wt + h + SS1
MOVOU f, X11; \
PXOR g, X11; \
PAND e, X11; \ // (f XOR g) AND e XOR g
PXOR g, X11; \
SSE_GG1(e, f, g, X11); \
PADDL X11, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
; \ // copy result
PROLD(b, 9); \
MOVOU X13, h; \
PROLD(f, 19); \
MOVOU X10, X13; \
PROLD(X13, 9); \
PXOR X13, X10; \ // tt2 XOR ROTL(9, tt2)
PSHUFB r08_mask<>(SB), X13; \ // ROTL(17, tt2)
PXOR X10, X13; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
MOVOU X13, d
SSE_COPY_RESULT(b, d, f, h, X13, X10)
// transpose matrix function, AVX/AVX2 version
// transpose matrix function, AVX version
// parameters:
// - r0: 128/256 bits register as input/output data
// - r1: 128/256 bits register as input/output data
// - r2: 128/256 bits register as input/output data
// - r3: 128/256 bits register as input/output data
// - tmp1: 128/256 bits temp register
// - tmp2: 128/256 bits temp register
// - r0: 128 bits register as input/output data
// - r1: 128 bits register as input/output data
// - r2: 128 bits register as input/output data
// - r3: 128 bits register as input/output data
// - tmp1: 128 bits temp register
// - tmp2: 128 bits temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w15, w7, w14, w6, w11, w3, w10, w2] tmp2 = [w7, w3, w6, w2]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0] r0 = [w5, w1, w4, w0]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w29, w21, w28, w20, w25, w17, w24, w16] tmp1 = [w13, w9, w12, w8]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w27, w30, w22, w27, w19, w26, w18] r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w29, w21, w13, w5, w25, w17, w9, w1] r1 = [w13, w9, w5, w1]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w28, w20, w12, w4, w24, w16, w8, w0] r0 = [w12, w8, w4, w0]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2]
VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w07, w03, w06, w02]
VPUNPCKLDQ r1, r0, r0; \ // r0 = [w05, w01, w04, w00]
VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w13, w09, w12, w08]
VPUNPCKHDQ r3, r2, r2; \ // r2 = [w15, w11, w14, w10]
VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w13, w09, w05, w01]
VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w12, w08, w04, w00]
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w15, w11, w07, w03]
VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w14, w10, w06, w02]
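The unpack sequence is a 4x4 transpose of 32-bit words across four XMM registers, matching the [w12, w08, w04, w00] style comments above. A scalar Go sketch of the net effect:

package sketch

// transpose4x4 models TRANSPOSE_MATRIX for XMM registers: on input
// lane j of r[i] holds w[4*i+j]; on output it holds w[4*j+i].
func transpose4x4(r *[4][4]uint32) {
	for i := 0; i < 4; i++ {
		for j := i + 1; j < 4; j++ {
			r[i][j], r[j][i] = r[j][i], r[i][j]
		}
	}
}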
#define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX)
#define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W
#define avxStoreState(R) \
VMOVDQU a, (0*16)(R) \
VMOVDQU b, (1*16)(R) \
VMOVDQU c, (2*16)(R) \
VMOVDQU d, (3*16)(R) \
VMOVDQU e, (4*16)(R) \
VMOVDQU f, (5*16)(R) \
VMOVDQU g, (6*16)(R) \
VMOVDQU h, (7*16)(R)
#define AVX_REV32(a, b, c, d) \
VPSHUFB flip_mask<>(SB), a, a; \
VPSHUFB flip_mask<>(SB), b, b; \
VPSHUFB flip_mask<>(SB), c, c; \
VPSHUFB flip_mask<>(SB), d, d
#define avxPrepare4Words(i) \
VMOVDQU (i*16)(R8), X10; \
VMOVDQU (i*16)(R9), X11; \
@@ -238,10 +258,7 @@ GLOBL r08_mask<>(SB), 8, $16
VMOVDQU (i*16)(R11), X13; \
; \
TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
VPSHUFB flip_mask<>(SB), X10, X10; \
VPSHUFB flip_mask<>(SB), X11, X11; \
VPSHUFB flip_mask<>(SB), X12, X12; \
VPSHUFB flip_mask<>(SB), X13, X13; \
AVX_REV32(X10, X11, X12, X13); \
; \
avxStoreWord(X10, 4*i+0); \
avxStoreWord(X11, 4*i+1); \
@@ -264,16 +281,49 @@ GLOBL r08_mask<>(SB), 8, $16
VPSRLD $(32-n), r, d; \
VPOR tmp1, d, d
#define AVX_SS1SS2(index, a, e, SS1, SS2) \
VPROLD2(a, SS2, 12); \ // a <<< 12
AVX_LOAD_T(index, SS1); \
VPADDD SS1, SS2, SS1; \
VPADDD e, SS1, SS1; \
VPROLD(SS1, 7); \ // SS1
VPXOR SS1, SS2, SS2
// DST = X XOR Y XOR Z
#define AVX_FF0(X, Y, Z, DST) \
VPXOR X, Y, DST; \
VPXOR Z, DST, DST
// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define AVX_FF1(X, Y, Z, TMP, DST) \
VPOR X, Y, DST; \
VPAND X, Y, TMP; \
VPAND Z, DST, DST; \
VPOR TMP, DST, DST
// DST = X XOR Y XOR Z
#define AVX_GG0(X, Y, Z, DST) \
AVX_FF0(X, Y, Z, DST)
// DST = ((Y XOR Z) AND X) XOR Z
#define AVX_GG1(X, Y, Z, DST) \
VPXOR Y, Z, DST; \
VPAND X, DST, DST; \
VPXOR Z, DST, DST
#define AVX_COPY_RESULT(b, d, f, h, TT1, TT2) \
VPROLD(b, 9); \
VMOVDQU TT1, h; \
VPROLD(f, 19); \
VPROLD2(TT2, TT1, 9); \ // tt2 <<< 9
VPXOR TT2, TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
VPSHUFB r08_mask<>(SB), TT1, TT1; \ // ROTL(17, tt2)
VPXOR TT2, TT1, d
#define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \
VPROLD2(a, X13, 12); \ // a <<< 12
AVX_LOAD_T(index, X12); \
VPADDD X12, X13, X12; \
VPADDD e, X12, X12; \
VPROLD(X12, 7); \ // SS1
VPXOR X12, X13, X13; \ // SS2
AVX_SS1SS2(index, a, e, X12, X13); \
; \
VPXOR a, b, X14; \
VPXOR c, X14, X14; \ // (a XOR b XOR c)
AVX_FF0(a, b, c, X14); \
VPADDD d, X14, X14; \ // (a XOR b XOR c) + d
avxLoadWord(X10, index); \
avxLoadWord(X11, index+4); \
@@ -282,17 +332,10 @@ GLOBL r08_mask<>(SB), 8, $16
VPADDD X14, X13, X13; \ // TT1
VPADDD h, X10, X10; \ // Wt + h
VPADDD X12, X10, X10; \ // Wt + h + SS1
VPXOR e, f, X11; \
VPXOR g, X11, X11; \ // (e XOR f XOR g)
AVX_GG0(e, f, g, X11); \
VPADDD X11, X10, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
; \ // copy result
VPROLD(b, 9); \
VMOVDQU X13, h; \
VPROLD(f, 19); \
VPROLD2(X10, X13, 9); \ // tt2 <<< 9
VPXOR X10, X13, X10; \ // tt2 XOR ROTL(9, tt2)
VPSHUFB r08_mask<>(SB), X13, X13; \ // ROTL(17, tt2)
VPXOR X10, X13, d
AVX_COPY_RESULT(b, d, f, h, X13, X10)
#define AVX_MESSAGE_SCHEDULE(index) \
avxLoadWord(X10, index+1); \ // Wj-3
@@ -316,17 +359,9 @@ GLOBL r08_mask<>(SB), 8, $16
#define AVX_ROUND_16_63(index, a, b, c, d, e, f, g, h) \
AVX_MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, please do not use it
VPROLD2(a, X13, 12); \ // a <<< 12
AVX_LOAD_T(index, X12); \
VPADDD X12, X13, X12; \
VPADDD e, X12, X12; \
VPROLD(X12, 7); \ // SS1
VPXOR X12, X13, X13; \ // SS2
AVX_SS1SS2(index, a, e, X12, X13); \
; \
VPOR a, b, X14; \
VPAND a, b, X10; \
VPAND c, X14, X14; \
VPOR X10, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c)
AVX_FF1(a, b, c, X10, X14); \
VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
avxLoadWord(X10, index); \
VPXOR X10, X11, X11; \ // Wt XOR Wt+4
@@ -335,18 +370,10 @@ GLOBL r08_mask<>(SB), 8, $16
; \
VPADDD h, X10, X10; \ // Wt + h
VPADDD X12, X10, X10; \ // Wt + h + SS1
VPXOR f, g, X11; \
VPAND e, X11, X11; \
VPXOR g, X11, X11; \ // (f XOR g) AND e XOR g
AVX_GG1(e, f, g, X11); \
VPADDD X11, X10, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
; \ // copy result
VPROLD(b, 9); \
VMOVDQU X13, h; \
VPROLD(f, 19); \
VPROLD2(X10, X13, 9); \ // tt2 <<< 9
VPXOR X10, X13, X10; \ // tt2 XOR ROTL(9, tt2)
VPSHUFB r08_mask<>(SB), X13, X13; \ // ROTL(17, tt2)
VPXOR X10, X13, d
AVX_COPY_RESULT(b, d, f, h, X13, X10)
// blockMultBy4(dig **[8]uint32, p *[]byte, buffer *byte, blocks int)
TEXT ·blockMultBy4(SB),NOSPLIT,$0
@@ -377,7 +404,7 @@ TEXT ·blockMultBy4(SB),NOSPLIT,$0
SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
// store state to temporary buffer
storeState
storeState(BX)
MOVQ $·_K+0(SB), AX
MOVQ (SI), R8
@@ -479,7 +506,7 @@ loop:
DECQ DX
JZ end
storeState
storeState(BX)
LEAQ 64(R8), R8
LEAQ 64(R9), R9
LEAQ 64(R10), R10
@@ -525,14 +552,7 @@ avx:
TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)
VMOVDQU a, (BX)
VMOVDQU b, 16(BX)
VMOVDQU c, 32(BX)
VMOVDQU d, 48(BX)
VMOVDQU e, 64(BX)
VMOVDQU f, 80(BX)
VMOVDQU g, 96(BX)
VMOVDQU h, 112(BX)
avxStoreState(BX)
MOVQ $·_K+0(SB), AX
MOVQ (SI), R8
@@ -627,14 +647,7 @@ avxLoop:
JZ avxEnd
// store current state
VMOVDQU a, (0*16)(BX)
VMOVDQU b, (1*16)(BX)
VMOVDQU c, (2*16)(BX)
VMOVDQU d, (3*16)(BX)
VMOVDQU e, (4*16)(BX)
VMOVDQU f, (5*16)(BX)
VMOVDQU g, (6*16)(BX)
VMOVDQU h, (7*16)(BX)
avxStoreState(BX)
LEAQ 64(R8), R8
LEAQ 64(R9), R9
@@ -680,23 +693,9 @@ TEXT ·copyResultsBy4(SB),NOSPLIT,$0
MOVOU (6*16)(DI), g
MOVOU (7*16)(DI), h
MOVOU flip_mask<>(SB), tmp1
PSHUFB tmp1, a
PSHUFB tmp1, b
PSHUFB tmp1, c
PSHUFB tmp1, d
PSHUFB tmp1, e
PSHUFB tmp1, f
PSHUFB tmp1, g
PSHUFB tmp1, h
MOVOU a, (0*16)(SI)
MOVOU b, (1*16)(SI)
MOVOU c, (2*16)(SI)
MOVOU d, (3*16)(SI)
MOVOU e, (4*16)(SI)
MOVOU f, (5*16)(SI)
MOVOU g, (6*16)(SI)
MOVOU h, (7*16)(SI)
SSE_REV32(a, b, c, d)
SSE_REV32(e, f, g, h)
storeState(SI)
RET
@@ -711,22 +710,9 @@ avx:
VMOVDQU (6*16)(DI), g
VMOVDQU (7*16)(DI), h
VPSHUFB flip_mask<>(SB), a, a
VPSHUFB flip_mask<>(SB), b, b
VPSHUFB flip_mask<>(SB), c, c
VPSHUFB flip_mask<>(SB), d, d
VPSHUFB flip_mask<>(SB), e, e
VPSHUFB flip_mask<>(SB), f, f
VPSHUFB flip_mask<>(SB), g, g
VPSHUFB flip_mask<>(SB), h, h
AVX_REV32(a, b, c, d)
AVX_REV32(e, f, g, h)
VMOVDQU a, (0*16)(SI)
VMOVDQU b, (1*16)(SI)
VMOVDQU c, (2*16)(SI)
VMOVDQU d, (3*16)(SI)
VMOVDQU e, (4*16)(SI)
VMOVDQU f, (5*16)(SI)
VMOVDQU g, (6*16)(SI)
VMOVDQU h, (7*16)(SI)
avxStoreState(SI)
RET