d540725871
Without this patch, the ChaCha block counter is not incremented on NEON rounds, resulting in incorrect calculations and corrupt packets. This also switches to using `--no-numbered --zero-commit` so that future diffs are smaller.

Reported-by: Hans Geiblinger <cybrnook2002@yahoo.com>
Reviewed-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Cc: David Bauer <mail@david-bauer.net>
Cc: Petr Štetiar <ynezz@true.cz>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
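Background on the dispatch pattern: the backported commit below replaces boolean flags and nested #ifdefs with static keys, so selecting an implementation costs a patched jump rather than a memory load and test on every call. A minimal sketch of that pattern follows; have_fast_impl, probe_init, fast_impl, and generic_impl are hypothetical names for illustration, not taken from the patch.

#include <linux/jump_label.h>	/* DEFINE_STATIC_KEY_FALSE, static_branch_* */
#include <asm/cpufeature.h>	/* boot_cpu_has() */

static void fast_impl(void);	/* hypothetical accelerated path */
static void generic_impl(void);	/* hypothetical fallback path */

/* Defaults to false; flipped at most once, during init. */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_fast_impl);

static int __init probe_init(void)
{
	/* Example feature test; the real patch probes SSSE3/AVX2/AVX-512VL. */
	if (boot_cpu_has(X86_FEATURE_SSSE3))
		static_branch_enable(&have_fast_impl);
	return 0;
}

static void do_work(void)
{
	/*
	 * After static_branch_enable(), this condition is a patched
	 * unconditional jump, not a runtime load-and-branch.
	 */
	if (static_branch_likely(&have_fast_impl))
		fast_impl();
	else
		generic_impl();
}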
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:10 +0100
Subject: [PATCH] crypto: x86/chacha - expose SIMD ChaCha routine as library
 function

commit 84e03fa39fbe95a5567d43bff458c6d3b3a23ad1 upstream.

Wire the existing x86 SIMD ChaCha code into the new ChaCha library
interface, so that users of the library interface will get the
accelerated version when available.

Given that calls into the library API will always go through the
routines in this module if it is enabled, switch to static keys
to select the optimal implementation available (which may be none
at all, in which case we defer to the generic implementation for
all invocations).

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/crypto/chacha_glue.c | 91 +++++++++++++++++++++++++----------
 crypto/Kconfig                |  1 +
 include/crypto/chacha.h       |  6 +++
 3 files changed, 73 insertions(+), 25 deletions(-)

--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u
 asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 					unsigned int len, int nrounds);
 asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
-#ifdef CONFIG_AS_AVX2
+
 asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
 asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
 asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 				       unsigned int len, int nrounds);
-static bool chacha_use_avx2;
-#ifdef CONFIG_AS_AVX512
+
 asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					   unsigned int len, int nrounds);
 asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					   unsigned int len, int nrounds);
 asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					   unsigned int len, int nrounds);
-static bool chacha_use_avx512vl;
-#endif
-#endif
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
 
 static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
 {
@@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsig
 static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
 			  unsigned int bytes, int nrounds)
 {
-#ifdef CONFIG_AS_AVX2
-#ifdef CONFIG_AS_AVX512
-	if (chacha_use_avx512vl) {
+	if (IS_ENABLED(CONFIG_AS_AVX512) &&
+	    static_branch_likely(&chacha_use_avx512vl)) {
 		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
 			chacha_8block_xor_avx512vl(state, dst, src, bytes,
 						   nrounds);
@@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8
 			return;
 		}
 	}
-#endif
-	if (chacha_use_avx2) {
+
+	if (IS_ENABLED(CONFIG_AS_AVX2) &&
+	    static_branch_likely(&chacha_use_avx2)) {
 		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
 			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
 			bytes -= CHACHA_BLOCK_SIZE * 8;
@@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8
 			return;
 		}
 	}
-#endif
+
 	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
 		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
 		bytes -= CHACHA_BLOCK_SIZE * 4;
@@ -123,6 +123,43 @@ static void chacha_dosimd(u32 *state, u8
 	}
 }
 
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
+		hchacha_block_generic(state, stream, nrounds);
+	} else {
+		kernel_fpu_begin();
+		hchacha_block_ssse3(state, stream, nrounds);
+		kernel_fpu_end();
+	}
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+	chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+		       int nrounds)
+{
+	state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
+	    bytes <= CHACHA_BLOCK_SIZE)
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	kernel_fpu_begin();
+	chacha_dosimd(state, dst, src, bytes, nrounds);
+	kernel_fpu_end();
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
 static int chacha_simd_stream_xor(struct skcipher_request *req,
 				  const struct chacha_ctx *ctx, const u8 *iv)
 {
@@ -143,7 +180,8 @@ static int chacha_simd_stream_xor(struct
 		if (nbytes < walk.total)
 			nbytes = round_down(nbytes, walk.stride);
 
-		if (!crypto_simd_usable()) {
+		if (!static_branch_likely(&chacha_use_simd) ||
+		    !crypto_simd_usable()) {
 			chacha_crypt_generic(state, walk.dst.virt.addr,
 					     walk.src.virt.addr, nbytes,
 					     ctx->nrounds);
@@ -246,18 +284,21 @@ static struct skcipher_alg algs[] = {
 static int __init chacha_simd_mod_init(void)
 {
 	if (!boot_cpu_has(X86_FEATURE_SSSE3))
-		return -ENODEV;
+		return 0;
 
-#ifdef CONFIG_AS_AVX2
-	chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
-			  boot_cpu_has(X86_FEATURE_AVX2) &&
-			  cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifdef CONFIG_AS_AVX512
-	chacha_use_avx512vl = chacha_use_avx2 &&
-			      boot_cpu_has(X86_FEATURE_AVX512VL) &&
-			      boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
-#endif
-#endif
+	static_branch_enable(&chacha_use_simd);
+
+	if (IS_ENABLED(CONFIG_AS_AVX2) &&
+	    boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_AVX2) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+		static_branch_enable(&chacha_use_avx2);
+
+		if (IS_ENABLED(CONFIG_AS_AVX512) &&
+		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
+			static_branch_enable(&chacha_use_avx512vl);
+	}
 	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }
 
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1418,6 +1418,7 @@ config CRYPTO_CHACHA20_X86_64
 	depends on X86 && 64BIT
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_LIB_CHACHA_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
 	help
 	  SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
 	  XChaCha20, and XChaCha12 stream ciphers.
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -25,6 +25,12 @@
 #define CHACHA_BLOCK_SIZE	64
 #define CHACHAPOLY_IV_SIZE	12
 
+#ifdef CONFIG_X86_64
+#define CHACHA_STATE_WORDS	((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
+#else
+#define CHACHA_STATE_WORDS	(CHACHA_BLOCK_SIZE / sizeof(u32))
+#endif
+
 /* 192-bit nonce, then 64-bit stream position */
 #define XCHACHA_IV_SIZE		32
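For reference, this is roughly how a consumer of the library interface added above would drive it. The helper encrypt_buf and its parameters are illustrative assumptions; only chacha_init_arch(), chacha_crypt_arch(), CHACHA_STATE_WORDS, and the 20-round count come from the patch itself.

#include <crypto/chacha.h>

static void encrypt_buf(u8 *dst, const u8 *src, unsigned int len,
			const u32 *key, const u8 *iv)
{
	/*
	 * CHACHA_STATE_WORDS reserves 12 extra bytes on x86-64 so the
	 * arch code can PTR_ALIGN() the state internally.
	 */
	u32 state[CHACHA_STATE_WORDS];

	chacha_init_arch(state, key, iv);		/* constants, key, counter, nonce */
	chacha_crypt_arch(state, dst, src, len, 20);	/* 20 rounds = ChaCha20 */
}

Passing the whole buffer in one call lets chacha_crypt_arch() pick the widest available SIMD path, while short inputs (bytes <= CHACHA_BLOCK_SIZE) or an unusable FPU fall through to chacha_crypt_generic(), exactly as the hunk above encodes.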