internal/bigmod: unroll loop in addMulVVW for ppc64x

This commit is contained in:
Sun Yimin 2024-03-06 17:47:29 +08:00 committed by GitHub
parent 88ddf3e3d2
commit 5c85d63724
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 100 additions and 30 deletions

39
.github/workflows/test_ppc64.yaml vendored Normal file
View File

@ -0,0 +1,39 @@
# Builds and tests internal/bigmod for ppc64le on an x86 runner.
# The test binary is cross-compiled by the Go toolchain (GOARCH) and then
# executed through the QEMU user-mode emulation registered by setup-qemu.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-go
name: ppc64le-qemu

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

jobs:
  test:
    strategy:
      matrix:
        go-version: [1.18.x]
        arch: [ppc64le]
    runs-on: ubuntu-latest
    steps:
      - name: Set up Go
        uses: actions/setup-go@v4
        with:
          go-version: ${{ matrix.go-version }}

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Check out code
        uses: actions/checkout@v4

      # Cross-compile for the matrix architecture so this step actually
      # assembles the ppc64x sources instead of building for the host.
      - name: Build
        run: go build -v ./internal/bigmod/...
        env:
          GOARCH: ${{ matrix.arch }}

      - name: Test
        run: go test -v -short ./internal/bigmod/...
        env:
          GODEBUG: x509sha1=1
          GOARCH: ${{ matrix.arch }}

View File

@ -8,49 +8,80 @@
// Fixed-size entry points for z += x * y.
//
// Each entry point only loads the loop trip count into R6 and tail-jumps to
// the file-local helper addMulVVWy<>, which reads z, x and y through FP
// itself and writes the carry-out result c. The helper consumes four words
// per iteration, so R6 = (number of words) / 4.

// func addMulVVW256(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW256(SB), $0-32
	MOVD	$1, R6		// R6 = z_len/4 (4 words)
	JMP	addMulVVWy<>(SB)

// func addMulVVW1024(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1024(SB), $0-32
	MOVD	$4, R6		// R6 = z_len/4 (16 words)
	JMP	addMulVVWy<>(SB)

// func addMulVVW1536(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1536(SB), $0-32
	MOVD	$6, R6		// R6 = z_len/4 (24 words)
	JMP	addMulVVWy<>(SB)

// func addMulVVW2048(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW2048(SB), $0-32
	MOVD	$8, R6		// R6 = z_len/4 (32 words)
	JMP	addMulVVWy<>(SB)

// addMulVVWy<> computes z[i] += x[i]*y + c for every word and returns the
// final carry word in c+24(FP).
//
// This local function expects to be called only by
// callers above. R6 contains the z length/4
// since 4 values are processed for each
// loop iteration, and is guaranteed to be > 0.
// If other callers are added this function might
// need to change.
//
// Register roles:
//	R3  = pointer to z[i], advanced by 32 bytes per iteration
//	R4  = pointer to x[i], advanced by 32 bytes per iteration
//	R5  = y
//	R6  = iteration count on entry (copied to CTR)
//	R9  = running carry word c
//	R14, R16, R18, R20 = x[i..i+3] as loaded
//	R15, R17, R19, R21 = z[i..i+3] as loaded
//	R10, R14, R16, R18 = result words z[i..i+3] (R14/R16/R18 are reused
//	                     once the x word they held has been multiplied)
//	R11, R15, R17, R19 = high halves of the four products
//
// Per-word step (lo, hi = x*y):
//	ADDC  z, lo	// lo += z[i], carry out in CA
//	ADDZE hi	// hi += CA
//	ADDC  c, lo	// lo += c, carry out in CA
//	ADDZE hi, c	// c = hi + CA
TEXT addMulVVWy<>(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3
	MOVD	x+8(FP), R4
	MOVD	y+16(FP), R5

	MOVD	$0, R9		// R9 = c = 0
	MOVD	R6, CTR		// Initialize loop counter
	PCALIGN	$16

loop:
	// Load all eight operands up front so the multiplies below do not
	// wait on memory.
	MOVD	0(R4), R14	// x[i]
	MOVD	8(R4), R16	// x[i+1]
	MOVD	16(R4), R18	// x[i+2]
	MOVD	24(R4), R20	// x[i+3]
	MOVD	0(R3), R15	// z[i]
	MOVD	8(R3), R17	// z[i+1]
	MOVD	16(R3), R19	// z[i+2]
	MOVD	24(R3), R21	// z[i+3]

	// Word i.
	MULLD	R5, R14, R10	// low x[i]*y
	MULHDU	R5, R14, R11	// high x[i]*y
	ADDC	R15, R10	// R10 = low + z[i]
	ADDZE	R11		// fold carry into high
	ADDC	R9, R10		// R10 += c
	ADDZE	R11, R9		// c = high + carry

	// Word i+1 (R14/R15 are free again and hold the product).
	MULLD	R5, R16, R14	// low x[i+1]*y
	MULHDU	R5, R16, R15	// high x[i+1]*y
	ADDC	R17, R14
	ADDZE	R15
	ADDC	R9, R14
	ADDZE	R15, R9

	// Word i+2.
	MULLD	R5, R18, R16	// low x[i+2]*y
	MULHDU	R5, R18, R17	// high x[i+2]*y
	ADDC	R19, R16
	ADDZE	R17
	ADDC	R9, R16
	ADDZE	R17, R9

	// Word i+3.
	MULLD	R5, R20, R18	// low x[i+3]*y
	MULHDU	R5, R20, R19	// high x[i+3]*y
	ADDC	R21, R18
	ADDZE	R19
	ADDC	R9, R18
	ADDZE	R19, R9

	MOVD	R10, 0(R3)	// z[i]
	MOVD	R14, 8(R3)	// z[i+1]
	MOVD	R16, 16(R3)	// z[i+2]
	MOVD	R18, 24(R3)	// z[i+3]
	ADD	$32, R3		// advance z by four words
	ADD	$32, R4		// advance x by four words

	BDNZ	loop		// decrement CTR, loop while non-zero

done:
	MOVD	R9, c+24(FP)
	RET