crypto: arm/curve25519 - wire up NEON implementation

This ports the SUPERCOP implementation for usage in kernel space. In
addition to the usual header, macro, and style changes required for
kernel space, it makes a few small changes to the code:

  - The stack alignment is relaxed to 16 bytes.
  - Superfluous mov statements have been removed.
  - ldr for constants has been replaced with movw.
  - ldreq has been replaced with moveq.
  - The str epilogue has been made more idiomatic.
  - SIMD registers are not pushed and popped at the beginning and end.
  - The prologue and epilogue have been made idiomatic.
  - A hole has been removed from the stack, saving 32 bytes.
  - We write-back the base register whenever possible for vld1.8.
  - Some multiplications have been reordered for better A7 performance.

There are more opportunities for cleanup, since this code is from qhasm,
which doesn't always do the most opportune thing. But even prior to
extensive hand optimizations, this code delivers significant performance
improvements (given in get_cycles() per call):

		      ----------- -------------
	             | generic C | this commit |
	 ------------ ----------- -------------
	| Cortex-A7  |     49136 |       22395 |
	 ------------ ----------- -------------
	| Cortex-A17 |     17326 |        4983 |
	 ------------ ----------- -------------

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
[ardb: - move to arch/arm/crypto
       - wire into lib/crypto framework
       - implement crypto API KPP hooks ]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Jason A. Donenfeld 2019-11-08 13:22:38 +01:00 committed by Herbert Xu
parent f0fb006b60
commit d8f1308a02
4 changed files with 287 additions and 195 deletions

View File

@ -142,4 +142,10 @@ config CRYPTO_NHPOLY1305_NEON
depends on KERNEL_MODE_NEON
select CRYPTO_NHPOLY1305
config CRYPTO_CURVE25519_NEON
tristate "NEON accelerated Curve25519 scalar multiplication library"
depends on KERNEL_MODE_NEON
select CRYPTO_LIB_CURVE25519_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
endif

View File

@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@ -38,6 +39,7 @@ chacha-neon-y := chacha-scalar-core.o chacha-glue.o
chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
poly1305-arm-y := poly1305-core.o poly1305-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
curve25519-neon-y := curve25519-core.o curve25519-glue.o
ifdef REGENERATE_ARM_CRYPTO
quiet_cmd_perl = PERL $@

View File

@ -1,43 +1,35 @@
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Public domain code from Daniel J. Bernstein and Peter Schwabe, from
* SUPERCOP's curve25519/neon2/scalarmult.s.
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
* Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
* began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
* manually reworked for use in kernel space.
*/
.fpu neon
#include <linux/linkage.h>
.text
.fpu neon
.arch armv7-a
.align 4
.global _crypto_scalarmult_curve25519_neon2
.global crypto_scalarmult_curve25519_neon2
.type _crypto_scalarmult_curve25519_neon2 STT_FUNC
.type crypto_scalarmult_curve25519_neon2 STT_FUNC
_crypto_scalarmult_curve25519_neon2:
crypto_scalarmult_curve25519_neon2:
vpush {q4, q5, q6, q7}
mov r12, sp
sub sp, sp, #736
and sp, sp, #0xffffffe0
strd r4, [sp, #0]
strd r6, [sp, #8]
strd r8, [sp, #16]
strd r10, [sp, #24]
str r12, [sp, #480]
str r14, [sp, #484]
mov r0, r0
mov r1, r1
mov r2, r2
add r3, sp, #32
ldr r4, =0
ldr r5, =254
ENTRY(curve25519_neon)
push {r4-r11, lr}
mov ip, sp
sub r3, sp, #704
and r3, r3, #0xfffffff0
mov sp, r3
movw r4, #0
movw r5, #254
vmov.i32 q0, #1
vshr.u64 q1, q0, #7
vshr.u64 q0, q0, #8
vmov.i32 d4, #19
vmov.i32 d5, #38
add r6, sp, #512
vst1.8 {d2-d3}, [r6, : 128]
add r6, sp, #528
vst1.8 {d0-d1}, [r6, : 128]
add r6, sp, #544
add r6, sp, #480
vst1.8 {d2-d3}, [r6, : 128]!
vst1.8 {d0-d1}, [r6, : 128]!
vst1.8 {d4-d5}, [r6, : 128]
add r6, r3, #0
vmov.i32 q2, #0
@ -45,12 +37,12 @@
vst1.8 {d4-d5}, [r6, : 128]!
vst1.8 d4, [r6, : 64]
add r6, r3, #0
ldr r7, =960
movw r7, #960
sub r7, r7, #2
neg r7, r7
sub r7, r7, r7, LSL #7
str r7, [r6]
add r6, sp, #704
add r6, sp, #672
vld1.8 {d4-d5}, [r1]!
vld1.8 {d6-d7}, [r1]
vst1.8 {d4-d5}, [r6, : 128]!
@ -212,15 +204,15 @@
vst1.8 {d0-d1}, [r6, : 128]!
vst1.8 {d2-d3}, [r6, : 128]!
vst1.8 d4, [r6, : 64]
._mainloop:
.Lmainloop:
mov r2, r5, LSR #3
and r6, r5, #7
ldrb r2, [r1, r2]
mov r2, r2, LSR r6
and r2, r2, #1
str r5, [sp, #488]
str r5, [sp, #456]
eor r4, r4, r2
str r2, [sp, #492]
str r2, [sp, #460]
neg r2, r4
add r4, r3, #96
add r5, r3, #192
@ -291,7 +283,7 @@
vsub.i32 q0, q1, q3
vst1.8 d4, [r4, : 64]
vst1.8 d0, [r6, : 64]
add r2, sp, #544
add r2, sp, #512
add r4, r3, #96
add r5, r3, #144
vld1.8 {d0-d1}, [r2, : 128]
@ -361,14 +353,13 @@
vmlal.s32 q0, d12, d8
vmlal.s32 q0, d13, d17
vmlal.s32 q0, d6, d6
add r2, sp, #512
vld1.8 {d18-d19}, [r2, : 128]
add r2, sp, #480
vld1.8 {d18-d19}, [r2, : 128]!
vmull.s32 q3, d16, d7
vmlal.s32 q3, d10, d15
vmlal.s32 q3, d11, d14
vmlal.s32 q3, d12, d9
vmlal.s32 q3, d13, d8
add r2, sp, #528
vld1.8 {d8-d9}, [r2, : 128]
vadd.i64 q5, q12, q9
vadd.i64 q6, q15, q9
@ -502,22 +493,19 @@
vadd.i32 q5, q5, q0
vtrn.32 q11, q14
vadd.i32 q6, q6, q3
add r2, sp, #560
add r2, sp, #528
vadd.i32 q10, q10, q2
vtrn.32 d24, d25
vst1.8 {d12-d13}, [r2, : 128]
vst1.8 {d12-d13}, [r2, : 128]!
vshl.i32 q6, q13, #1
add r2, sp, #576
vst1.8 {d20-d21}, [r2, : 128]
vst1.8 {d20-d21}, [r2, : 128]!
vshl.i32 q10, q14, #1
add r2, sp, #592
vst1.8 {d12-d13}, [r2, : 128]
vst1.8 {d12-d13}, [r2, : 128]!
vshl.i32 q15, q12, #1
vadd.i32 q8, q8, q4
vext.32 d10, d31, d30, #0
vadd.i32 q7, q7, q1
add r2, sp, #608
vst1.8 {d16-d17}, [r2, : 128]
vst1.8 {d16-d17}, [r2, : 128]!
vmull.s32 q8, d18, d5
vmlal.s32 q8, d26, d4
vmlal.s32 q8, d19, d9
@ -528,8 +516,7 @@
vmlal.s32 q8, d29, d1
vmlal.s32 q8, d24, d6
vmlal.s32 q8, d25, d0
add r2, sp, #624
vst1.8 {d14-d15}, [r2, : 128]
vst1.8 {d14-d15}, [r2, : 128]!
vmull.s32 q2, d18, d4
vmlal.s32 q2, d12, d9
vmlal.s32 q2, d13, d8
@ -537,8 +524,7 @@
vmlal.s32 q2, d22, d2
vmlal.s32 q2, d23, d1
vmlal.s32 q2, d24, d0
add r2, sp, #640
vst1.8 {d20-d21}, [r2, : 128]
vst1.8 {d20-d21}, [r2, : 128]!
vmull.s32 q7, d18, d9
vmlal.s32 q7, d26, d3
vmlal.s32 q7, d19, d8
@ -547,14 +533,12 @@
vmlal.s32 q7, d28, d1
vmlal.s32 q7, d23, d6
vmlal.s32 q7, d29, d0
add r2, sp, #656
vst1.8 {d10-d11}, [r2, : 128]
vst1.8 {d10-d11}, [r2, : 128]!
vmull.s32 q5, d18, d3
vmlal.s32 q5, d19, d2
vmlal.s32 q5, d22, d1
vmlal.s32 q5, d23, d0
vmlal.s32 q5, d12, d8
add r2, sp, #672
vst1.8 {d16-d17}, [r2, : 128]
vmull.s32 q4, d18, d8
vmlal.s32 q4, d26, d2
@ -566,7 +550,7 @@
vmlal.s32 q8, d26, d1
vmlal.s32 q8, d19, d6
vmlal.s32 q8, d27, d0
add r2, sp, #576
add r2, sp, #544
vld1.8 {d20-d21}, [r2, : 128]
vmlal.s32 q7, d24, d21
vmlal.s32 q7, d25, d20
@ -575,32 +559,30 @@
vmlal.s32 q8, d22, d21
vmlal.s32 q8, d28, d20
vmlal.s32 q5, d24, d20
add r2, sp, #576
vst1.8 {d14-d15}, [r2, : 128]
vmull.s32 q7, d18, d6
vmlal.s32 q7, d26, d0
add r2, sp, #656
add r2, sp, #624
vld1.8 {d30-d31}, [r2, : 128]
vmlal.s32 q2, d30, d21
vmlal.s32 q7, d19, d21
vmlal.s32 q7, d27, d20
add r2, sp, #624
add r2, sp, #592
vld1.8 {d26-d27}, [r2, : 128]
vmlal.s32 q4, d25, d27
vmlal.s32 q8, d29, d27
vmlal.s32 q8, d25, d26
vmlal.s32 q7, d28, d27
vmlal.s32 q7, d29, d26
add r2, sp, #608
add r2, sp, #576
vld1.8 {d28-d29}, [r2, : 128]
vmlal.s32 q4, d24, d29
vmlal.s32 q8, d23, d29
vmlal.s32 q8, d24, d28
vmlal.s32 q7, d22, d29
vmlal.s32 q7, d23, d28
add r2, sp, #608
vst1.8 {d8-d9}, [r2, : 128]
add r2, sp, #560
add r2, sp, #528
vld1.8 {d8-d9}, [r2, : 128]
vmlal.s32 q7, d24, d9
vmlal.s32 q7, d25, d31
@ -621,36 +603,36 @@
vmlal.s32 q0, d23, d26
vmlal.s32 q0, d24, d31
vmlal.s32 q0, d19, d20
add r2, sp, #640
add r2, sp, #608
vld1.8 {d18-d19}, [r2, : 128]
vmlal.s32 q2, d18, d7
vmlal.s32 q2, d19, d6
vmlal.s32 q5, d18, d6
vmlal.s32 q5, d19, d21
vmlal.s32 q1, d18, d21
vmlal.s32 q1, d19, d29
vmlal.s32 q0, d18, d28
vmlal.s32 q0, d19, d9
vmlal.s32 q6, d18, d29
vmlal.s32 q2, d19, d6
vmlal.s32 q5, d19, d21
vmlal.s32 q1, d19, d29
vmlal.s32 q0, d19, d9
vmlal.s32 q6, d19, d28
add r2, sp, #592
add r2, sp, #560
vld1.8 {d18-d19}, [r2, : 128]
add r2, sp, #512
add r2, sp, #480
vld1.8 {d22-d23}, [r2, : 128]
vmlal.s32 q5, d19, d7
vmlal.s32 q0, d18, d21
vmlal.s32 q0, d19, d29
vmlal.s32 q6, d18, d6
add r2, sp, #528
add r2, sp, #496
vld1.8 {d6-d7}, [r2, : 128]
vmlal.s32 q6, d19, d21
add r2, sp, #576
add r2, sp, #544
vld1.8 {d18-d19}, [r2, : 128]
vmlal.s32 q0, d30, d8
add r2, sp, #672
add r2, sp, #640
vld1.8 {d20-d21}, [r2, : 128]
vmlal.s32 q5, d30, d29
add r2, sp, #608
add r2, sp, #576
vld1.8 {d24-d25}, [r2, : 128]
vmlal.s32 q1, d30, d28
vadd.i64 q13, q0, q11
@ -823,22 +805,19 @@
vadd.i32 q5, q5, q0
vtrn.32 q11, q14
vadd.i32 q6, q6, q3
add r2, sp, #560
add r2, sp, #528
vadd.i32 q10, q10, q2
vtrn.32 d24, d25
vst1.8 {d12-d13}, [r2, : 128]
vst1.8 {d12-d13}, [r2, : 128]!
vshl.i32 q6, q13, #1
add r2, sp, #576
vst1.8 {d20-d21}, [r2, : 128]
vst1.8 {d20-d21}, [r2, : 128]!
vshl.i32 q10, q14, #1
add r2, sp, #592
vst1.8 {d12-d13}, [r2, : 128]
vst1.8 {d12-d13}, [r2, : 128]!
vshl.i32 q15, q12, #1
vadd.i32 q8, q8, q4
vext.32 d10, d31, d30, #0
vadd.i32 q7, q7, q1
add r2, sp, #608
vst1.8 {d16-d17}, [r2, : 128]
vst1.8 {d16-d17}, [r2, : 128]!
vmull.s32 q8, d18, d5
vmlal.s32 q8, d26, d4
vmlal.s32 q8, d19, d9
@ -849,8 +828,7 @@
vmlal.s32 q8, d29, d1
vmlal.s32 q8, d24, d6
vmlal.s32 q8, d25, d0
add r2, sp, #624
vst1.8 {d14-d15}, [r2, : 128]
vst1.8 {d14-d15}, [r2, : 128]!
vmull.s32 q2, d18, d4
vmlal.s32 q2, d12, d9
vmlal.s32 q2, d13, d8
@ -858,8 +836,7 @@
vmlal.s32 q2, d22, d2
vmlal.s32 q2, d23, d1
vmlal.s32 q2, d24, d0
add r2, sp, #640
vst1.8 {d20-d21}, [r2, : 128]
vst1.8 {d20-d21}, [r2, : 128]!
vmull.s32 q7, d18, d9
vmlal.s32 q7, d26, d3
vmlal.s32 q7, d19, d8
@ -868,15 +845,13 @@
vmlal.s32 q7, d28, d1
vmlal.s32 q7, d23, d6
vmlal.s32 q7, d29, d0
add r2, sp, #656
vst1.8 {d10-d11}, [r2, : 128]
vst1.8 {d10-d11}, [r2, : 128]!
vmull.s32 q5, d18, d3
vmlal.s32 q5, d19, d2
vmlal.s32 q5, d22, d1
vmlal.s32 q5, d23, d0
vmlal.s32 q5, d12, d8
add r2, sp, #672
vst1.8 {d16-d17}, [r2, : 128]
vst1.8 {d16-d17}, [r2, : 128]!
vmull.s32 q4, d18, d8
vmlal.s32 q4, d26, d2
vmlal.s32 q4, d19, d7
@ -887,7 +862,7 @@
vmlal.s32 q8, d26, d1
vmlal.s32 q8, d19, d6
vmlal.s32 q8, d27, d0
add r2, sp, #576
add r2, sp, #544
vld1.8 {d20-d21}, [r2, : 128]
vmlal.s32 q7, d24, d21
vmlal.s32 q7, d25, d20
@ -896,32 +871,30 @@
vmlal.s32 q8, d22, d21
vmlal.s32 q8, d28, d20
vmlal.s32 q5, d24, d20
add r2, sp, #576
vst1.8 {d14-d15}, [r2, : 128]
vmull.s32 q7, d18, d6
vmlal.s32 q7, d26, d0
add r2, sp, #656
add r2, sp, #624
vld1.8 {d30-d31}, [r2, : 128]
vmlal.s32 q2, d30, d21
vmlal.s32 q7, d19, d21
vmlal.s32 q7, d27, d20
add r2, sp, #624
add r2, sp, #592
vld1.8 {d26-d27}, [r2, : 128]
vmlal.s32 q4, d25, d27
vmlal.s32 q8, d29, d27
vmlal.s32 q8, d25, d26
vmlal.s32 q7, d28, d27
vmlal.s32 q7, d29, d26
add r2, sp, #608
add r2, sp, #576
vld1.8 {d28-d29}, [r2, : 128]
vmlal.s32 q4, d24, d29
vmlal.s32 q8, d23, d29
vmlal.s32 q8, d24, d28
vmlal.s32 q7, d22, d29
vmlal.s32 q7, d23, d28
add r2, sp, #608
vst1.8 {d8-d9}, [r2, : 128]
add r2, sp, #560
add r2, sp, #528
vld1.8 {d8-d9}, [r2, : 128]
vmlal.s32 q7, d24, d9
vmlal.s32 q7, d25, d31
@ -942,36 +915,36 @@
vmlal.s32 q0, d23, d26
vmlal.s32 q0, d24, d31
vmlal.s32 q0, d19, d20
add r2, sp, #640
add r2, sp, #608
vld1.8 {d18-d19}, [r2, : 128]
vmlal.s32 q2, d18, d7
vmlal.s32 q2, d19, d6
vmlal.s32 q5, d18, d6
vmlal.s32 q5, d19, d21
vmlal.s32 q1, d18, d21
vmlal.s32 q1, d19, d29
vmlal.s32 q0, d18, d28
vmlal.s32 q0, d19, d9
vmlal.s32 q6, d18, d29
vmlal.s32 q2, d19, d6
vmlal.s32 q5, d19, d21
vmlal.s32 q1, d19, d29
vmlal.s32 q0, d19, d9
vmlal.s32 q6, d19, d28
add r2, sp, #592
add r2, sp, #560
vld1.8 {d18-d19}, [r2, : 128]
add r2, sp, #512
add r2, sp, #480
vld1.8 {d22-d23}, [r2, : 128]
vmlal.s32 q5, d19, d7
vmlal.s32 q0, d18, d21
vmlal.s32 q0, d19, d29
vmlal.s32 q6, d18, d6
add r2, sp, #528
add r2, sp, #496
vld1.8 {d6-d7}, [r2, : 128]
vmlal.s32 q6, d19, d21
add r2, sp, #576
add r2, sp, #544
vld1.8 {d18-d19}, [r2, : 128]
vmlal.s32 q0, d30, d8
add r2, sp, #672
add r2, sp, #640
vld1.8 {d20-d21}, [r2, : 128]
vmlal.s32 q5, d30, d29
add r2, sp, #608
add r2, sp, #576
vld1.8 {d24-d25}, [r2, : 128]
vmlal.s32 q1, d30, d28
vadd.i64 q13, q0, q11
@ -1069,7 +1042,7 @@
sub r4, r4, #24
vst1.8 d0, [r2, : 64]
vst1.8 d1, [r4, : 64]
add r2, sp, #544
add r2, sp, #512
add r4, r3, #144
add r5, r3, #192
vld1.8 {d0-d1}, [r2, : 128]
@ -1139,14 +1112,13 @@
vmlal.s32 q0, d12, d8
vmlal.s32 q0, d13, d17
vmlal.s32 q0, d6, d6
add r2, sp, #512
vld1.8 {d18-d19}, [r2, : 128]
add r2, sp, #480
vld1.8 {d18-d19}, [r2, : 128]!
vmull.s32 q3, d16, d7
vmlal.s32 q3, d10, d15
vmlal.s32 q3, d11, d14
vmlal.s32 q3, d12, d9
vmlal.s32 q3, d13, d8
add r2, sp, #528
vld1.8 {d8-d9}, [r2, : 128]
vadd.i64 q5, q12, q9
vadd.i64 q6, q15, q9
@ -1295,22 +1267,19 @@
vadd.i32 q5, q5, q0
vtrn.32 q11, q14
vadd.i32 q6, q6, q3
add r2, sp, #560
add r2, sp, #528
vadd.i32 q10, q10, q2
vtrn.32 d24, d25
vst1.8 {d12-d13}, [r2, : 128]
vst1.8 {d12-d13}, [r2, : 128]!
vshl.i32 q6, q13, #1
add r2, sp, #576
vst1.8 {d20-d21}, [r2, : 128]
vst1.8 {d20-d21}, [r2, : 128]!
vshl.i32 q10, q14, #1
add r2, sp, #592
vst1.8 {d12-d13}, [r2, : 128]
vst1.8 {d12-d13}, [r2, : 128]!
vshl.i32 q15, q12, #1
vadd.i32 q8, q8, q4
vext.32 d10, d31, d30, #0
vadd.i32 q7, q7, q1
add r2, sp, #608
vst1.8 {d16-d17}, [r2, : 128]
vst1.8 {d16-d17}, [r2, : 128]!
vmull.s32 q8, d18, d5
vmlal.s32 q8, d26, d4
vmlal.s32 q8, d19, d9
@ -1321,8 +1290,7 @@
vmlal.s32 q8, d29, d1
vmlal.s32 q8, d24, d6
vmlal.s32 q8, d25, d0
add r2, sp, #624
vst1.8 {d14-d15}, [r2, : 128]
vst1.8 {d14-d15}, [r2, : 128]!
vmull.s32 q2, d18, d4
vmlal.s32 q2, d12, d9
vmlal.s32 q2, d13, d8
@ -1330,8 +1298,7 @@
vmlal.s32 q2, d22, d2
vmlal.s32 q2, d23, d1
vmlal.s32 q2, d24, d0
add r2, sp, #640
vst1.8 {d20-d21}, [r2, : 128]
vst1.8 {d20-d21}, [r2, : 128]!
vmull.s32 q7, d18, d9
vmlal.s32 q7, d26, d3
vmlal.s32 q7, d19, d8
@ -1340,15 +1307,13 @@
vmlal.s32 q7, d28, d1
vmlal.s32 q7, d23, d6
vmlal.s32 q7, d29, d0
add r2, sp, #656
vst1.8 {d10-d11}, [r2, : 128]
vst1.8 {d10-d11}, [r2, : 128]!
vmull.s32 q5, d18, d3
vmlal.s32 q5, d19, d2
vmlal.s32 q5, d22, d1
vmlal.s32 q5, d23, d0
vmlal.s32 q5, d12, d8
add r2, sp, #672
vst1.8 {d16-d17}, [r2, : 128]
vst1.8 {d16-d17}, [r2, : 128]!
vmull.s32 q4, d18, d8
vmlal.s32 q4, d26, d2
vmlal.s32 q4, d19, d7
@ -1359,7 +1324,7 @@
vmlal.s32 q8, d26, d1
vmlal.s32 q8, d19, d6
vmlal.s32 q8, d27, d0
add r2, sp, #576
add r2, sp, #544
vld1.8 {d20-d21}, [r2, : 128]
vmlal.s32 q7, d24, d21
vmlal.s32 q7, d25, d20
@ -1368,32 +1333,30 @@
vmlal.s32 q8, d22, d21
vmlal.s32 q8, d28, d20
vmlal.s32 q5, d24, d20
add r2, sp, #576
vst1.8 {d14-d15}, [r2, : 128]
vmull.s32 q7, d18, d6
vmlal.s32 q7, d26, d0
add r2, sp, #656
add r2, sp, #624
vld1.8 {d30-d31}, [r2, : 128]
vmlal.s32 q2, d30, d21
vmlal.s32 q7, d19, d21
vmlal.s32 q7, d27, d20
add r2, sp, #624
add r2, sp, #592
vld1.8 {d26-d27}, [r2, : 128]
vmlal.s32 q4, d25, d27
vmlal.s32 q8, d29, d27
vmlal.s32 q8, d25, d26
vmlal.s32 q7, d28, d27
vmlal.s32 q7, d29, d26
add r2, sp, #608
add r2, sp, #576
vld1.8 {d28-d29}, [r2, : 128]
vmlal.s32 q4, d24, d29
vmlal.s32 q8, d23, d29
vmlal.s32 q8, d24, d28
vmlal.s32 q7, d22, d29
vmlal.s32 q7, d23, d28
add r2, sp, #608
vst1.8 {d8-d9}, [r2, : 128]
add r2, sp, #560
add r2, sp, #528
vld1.8 {d8-d9}, [r2, : 128]
vmlal.s32 q7, d24, d9
vmlal.s32 q7, d25, d31
@ -1414,36 +1377,36 @@
vmlal.s32 q0, d23, d26
vmlal.s32 q0, d24, d31
vmlal.s32 q0, d19, d20
add r2, sp, #640
add r2, sp, #608
vld1.8 {d18-d19}, [r2, : 128]
vmlal.s32 q2, d18, d7
vmlal.s32 q2, d19, d6
vmlal.s32 q5, d18, d6
vmlal.s32 q5, d19, d21
vmlal.s32 q1, d18, d21
vmlal.s32 q1, d19, d29
vmlal.s32 q0, d18, d28
vmlal.s32 q0, d19, d9
vmlal.s32 q6, d18, d29
vmlal.s32 q2, d19, d6
vmlal.s32 q5, d19, d21
vmlal.s32 q1, d19, d29
vmlal.s32 q0, d19, d9
vmlal.s32 q6, d19, d28
add r2, sp, #592
add r2, sp, #560
vld1.8 {d18-d19}, [r2, : 128]
add r2, sp, #512
add r2, sp, #480
vld1.8 {d22-d23}, [r2, : 128]
vmlal.s32 q5, d19, d7
vmlal.s32 q0, d18, d21
vmlal.s32 q0, d19, d29
vmlal.s32 q6, d18, d6
add r2, sp, #528
add r2, sp, #496
vld1.8 {d6-d7}, [r2, : 128]
vmlal.s32 q6, d19, d21
add r2, sp, #576
add r2, sp, #544
vld1.8 {d18-d19}, [r2, : 128]
vmlal.s32 q0, d30, d8
add r2, sp, #672
add r2, sp, #640
vld1.8 {d20-d21}, [r2, : 128]
vmlal.s32 q5, d30, d29
add r2, sp, #608
add r2, sp, #576
vld1.8 {d24-d25}, [r2, : 128]
vmlal.s32 q1, d30, d28
vadd.i64 q13, q0, q11
@ -1541,10 +1504,10 @@
sub r4, r4, #24
vst1.8 d0, [r2, : 64]
vst1.8 d1, [r4, : 64]
ldr r2, [sp, #488]
ldr r4, [sp, #492]
ldr r2, [sp, #456]
ldr r4, [sp, #460]
subs r5, r2, #1
bge ._mainloop
bge .Lmainloop
add r1, r3, #144
add r2, r3, #336
vld1.8 {d0-d1}, [r1, : 128]!
@ -1553,41 +1516,41 @@
vst1.8 {d0-d1}, [r2, : 128]!
vst1.8 {d2-d3}, [r2, : 128]!
vst1.8 d4, [r2, : 64]
ldr r1, =0
._invertloop:
movw r1, #0
.Linvertloop:
add r2, r3, #144
ldr r4, =0
ldr r5, =2
movw r4, #0
movw r5, #2
cmp r1, #1
ldreq r5, =1
moveq r5, #1
addeq r2, r3, #336
addeq r4, r3, #48
cmp r1, #2
ldreq r5, =1
moveq r5, #1
addeq r2, r3, #48
cmp r1, #3
ldreq r5, =5
moveq r5, #5
addeq r4, r3, #336
cmp r1, #4
ldreq r5, =10
moveq r5, #10
cmp r1, #5
ldreq r5, =20
moveq r5, #20
cmp r1, #6
ldreq r5, =10
moveq r5, #10
addeq r2, r3, #336
addeq r4, r3, #336
cmp r1, #7
ldreq r5, =50
moveq r5, #50
cmp r1, #8
ldreq r5, =100
moveq r5, #100
cmp r1, #9
ldreq r5, =50
moveq r5, #50
addeq r2, r3, #336
cmp r1, #10
ldreq r5, =5
moveq r5, #5
addeq r2, r3, #48
cmp r1, #11
ldreq r5, =0
moveq r5, #0
addeq r2, r3, #96
add r6, r3, #144
add r7, r3, #288
@ -1598,8 +1561,8 @@
vst1.8 {d2-d3}, [r7, : 128]!
vst1.8 d4, [r7, : 64]
cmp r5, #0
beq ._skipsquaringloop
._squaringloop:
beq .Lskipsquaringloop
.Lsquaringloop:
add r6, r3, #288
add r7, r3, #288
add r8, r3, #288
@ -1611,7 +1574,7 @@
vld1.8 {d6-d7}, [r7, : 128]!
vld1.8 {d9}, [r7, : 64]
vld1.8 {d10-d11}, [r6, : 128]!
add r7, sp, #416
add r7, sp, #384
vld1.8 {d12-d13}, [r6, : 128]!
vmul.i32 q7, q2, q0
vld1.8 {d8}, [r6, : 64]
@ -1726,7 +1689,7 @@
vext.32 d10, d6, d6, #0
vmov.i32 q1, #0xffffffff
vshl.i64 q4, q1, #25
add r7, sp, #512
add r7, sp, #480
vld1.8 {d14-d15}, [r7, : 128]
vadd.i64 q9, q2, q7
vshl.i64 q1, q1, #26
@ -1735,7 +1698,7 @@
vadd.i64 q5, q5, q10
vand q9, q9, q1
vld1.8 {d16}, [r6, : 64]!
add r6, sp, #528
add r6, sp, #496
vld1.8 {d20-d21}, [r6, : 128]
vadd.i64 q11, q5, q10
vsub.i64 q2, q2, q9
@ -1789,8 +1752,8 @@
sub r6, r6, #32
vst1.8 d4, [r6, : 64]
subs r5, r5, #1
bhi ._squaringloop
._skipsquaringloop:
bhi .Lsquaringloop
.Lskipsquaringloop:
mov r2, r2
add r5, r3, #288
add r6, r3, #144
@ -1802,7 +1765,7 @@
vld1.8 {d6-d7}, [r5, : 128]!
vld1.8 {d9}, [r5, : 64]
vld1.8 {d10-d11}, [r2, : 128]!
add r5, sp, #416
add r5, sp, #384
vld1.8 {d12-d13}, [r2, : 128]!
vmul.i32 q7, q2, q0
vld1.8 {d8}, [r2, : 64]
@ -1917,7 +1880,7 @@
vext.32 d10, d6, d6, #0
vmov.i32 q1, #0xffffffff
vshl.i64 q4, q1, #25
add r5, sp, #512
add r5, sp, #480
vld1.8 {d14-d15}, [r5, : 128]
vadd.i64 q9, q2, q7
vshl.i64 q1, q1, #26
@ -1926,7 +1889,7 @@
vadd.i64 q5, q5, q10
vand q9, q9, q1
vld1.8 {d16}, [r2, : 64]!
add r2, sp, #528
add r2, sp, #496
vld1.8 {d20-d21}, [r2, : 128]
vadd.i64 q11, q5, q10
vsub.i64 q2, q2, q9
@ -1980,7 +1943,7 @@
sub r2, r2, #32
vst1.8 d4, [r2, : 64]
cmp r4, #0
beq ._skippostcopy
beq .Lskippostcopy
add r2, r3, #144
mov r4, r4
vld1.8 {d0-d1}, [r2, : 128]!
@ -1989,9 +1952,9 @@
vst1.8 {d0-d1}, [r4, : 128]!
vst1.8 {d2-d3}, [r4, : 128]!
vst1.8 d4, [r4, : 64]
._skippostcopy:
.Lskippostcopy:
cmp r1, #1
bne ._skipfinalcopy
bne .Lskipfinalcopy
add r2, r3, #288
add r4, r3, #144
vld1.8 {d0-d1}, [r2, : 128]!
@ -2000,10 +1963,10 @@
vst1.8 {d0-d1}, [r4, : 128]!
vst1.8 {d2-d3}, [r4, : 128]!
vst1.8 d4, [r4, : 64]
._skipfinalcopy:
.Lskipfinalcopy:
add r1, r1, #1
cmp r1, #12
blo ._invertloop
blo .Linvertloop
add r1, r3, #144
ldr r2, [r1], #4
ldr r3, [r1], #4
@ -2085,21 +2048,15 @@
add r8, r8, r10, LSL #12
mov r9, r10, LSR #20
add r1, r9, r1, LSL #6
str r2, [r0], #4
str r3, [r0], #4
str r4, [r0], #4
str r5, [r0], #4
str r6, [r0], #4
str r7, [r0], #4
str r8, [r0], #4
str r1, [r0]
ldrd r4, [sp, #0]
ldrd r6, [sp, #8]
ldrd r8, [sp, #16]
ldrd r10, [sp, #24]
ldr r12, [sp, #480]
ldr r14, [sp, #484]
ldr r0, =0
mov sp, r12
vpop {q4, q5, q6, q7}
bx lr
str r2, [r0]
str r3, [r0, #4]
str r4, [r0, #8]
str r5, [r0, #12]
str r6, [r0, #16]
str r7, [r0, #20]
str r8, [r0, #24]
str r1, [r0, #28]
movw r0, #0
mov sp, ip
pop {r4-r11, pc}
ENDPROC(curve25519_neon)

View File

@ -0,0 +1,127 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
* Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
* began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
* manually reworked for use in kernel space.
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/kpp.h>
#include <crypto/internal/simd.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/jump_label.h>
#include <crypto/curve25519.h>
asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE],
const u8 basepoint[CURVE25519_KEY_SIZE]);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
const u8 scalar[CURVE25519_KEY_SIZE],
const u8 point[CURVE25519_KEY_SIZE])
{
if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
kernel_neon_begin();
curve25519_neon(out, scalar, point);
kernel_neon_end();
} else {
curve25519_generic(out, scalar, point);
}
}
EXPORT_SYMBOL(curve25519_arch);
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
unsigned int len)
{
u8 *secret = kpp_tfm_ctx(tfm);
if (!len)
curve25519_generate_secret(secret);
else if (len == CURVE25519_KEY_SIZE &&
crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
memcpy(secret, buf, CURVE25519_KEY_SIZE);
else
return -EINVAL;
return 0;
}
static int curve25519_compute_value(struct kpp_request *req)
{
struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
const u8 *secret = kpp_tfm_ctx(tfm);
u8 public_key[CURVE25519_KEY_SIZE];
u8 buf[CURVE25519_KEY_SIZE];
int copied, nbytes;
u8 const *bp;
if (req->src) {
copied = sg_copy_to_buffer(req->src,
sg_nents_for_len(req->src,
CURVE25519_KEY_SIZE),
public_key, CURVE25519_KEY_SIZE);
if (copied != CURVE25519_KEY_SIZE)
return -EINVAL;
bp = public_key;
} else {
bp = curve25519_base_point;
}
curve25519_arch(buf, secret, bp);
/* might want less than we've got */
nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
nbytes),
buf, nbytes);
if (copied != nbytes)
return -EINVAL;
return 0;
}
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
return CURVE25519_KEY_SIZE;
}
static struct kpp_alg curve25519_alg = {
.base.cra_name = "curve25519",
.base.cra_driver_name = "curve25519-neon",
.base.cra_priority = 200,
.base.cra_module = THIS_MODULE,
.base.cra_ctxsize = CURVE25519_KEY_SIZE,
.set_secret = curve25519_set_secret,
.generate_public_key = curve25519_compute_value,
.compute_shared_secret = curve25519_compute_value,
.max_size = curve25519_max_size,
};
static int __init mod_init(void)
{
if (elf_hwcap & HWCAP_NEON) {
static_branch_enable(&have_neon);
return crypto_register_kpp(&curve25519_alg);
}
return 0;
}
static void __exit mod_exit(void)
{
if (elf_hwcap & HWCAP_NEON)
crypto_unregister_kpp(&curve25519_alg);
}
module_init(mod_init);
module_exit(mod_exit);
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-neon");
MODULE_LICENSE("GPL v2");