d540725871
Without this patch, the chacha block counter is not incremented on neon rounds, resulting in incorrect calculations and corrupt packets. This also switches to using `--no-numbered --zero-commit` so that future diffs are smaller. Reported-by: Hans Geiblinger <cybrnook2002@yahoo.com> Reviewed-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com> Cc: David Bauer <mail@david-bauer.net> Cc: Petr Štetiar <ynezz@true.cz> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
148 lines
4.9 KiB
Diff
148 lines
4.9 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Ard Biesheuvel <ardb@kernel.org>
|
|
Date: Wed, 8 Jul 2020 12:11:18 +0300
|
|
Subject: [PATCH] crypto: x86/chacha-sse3 - use unaligned loads for state array
|
|
|
|
commit e79a31715193686e92dadb4caedfbb1f5de3659c upstream.
|
|
|
|
Due to the fact that the x86 port does not support allocating objects
|
|
on the stack with an alignment that exceeds 8 bytes, we have a rather
|
|
ugly hack in the x86 code for ChaCha to ensure that the state array is
|
|
aligned to 16 bytes, allowing the SSE3 implementation of the algorithm
|
|
to use aligned loads.
|
|
|
|
Given that the performance benefit of using aligned loads appears to
|
|
be limited (~0.25% for 1k blocks using tcrypt on a Core i7-8650U), and
|
|
the fact that this hack has leaked into generic ChaCha code, let's just
|
|
remove it.
|
|
|
|
Cc: Martin Willi <martin@strongswan.org>
|
|
Cc: Herbert Xu <herbert@gondor.apana.org.au>
|
|
Cc: Eric Biggers <ebiggers@kernel.org>
|
|
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
|
|
Reviewed-by: Martin Willi <martin@strongswan.org>
|
|
Reviewed-by: Eric Biggers <ebiggers@google.com>
|
|
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
|
|
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
|
|
---
|
|
arch/x86/crypto/chacha-ssse3-x86_64.S | 16 ++++++++--------
|
|
arch/x86/crypto/chacha_glue.c | 17 ++---------------
|
|
include/crypto/chacha.h | 4 ----
|
|
3 files changed, 10 insertions(+), 27 deletions(-)
|
|
|
|
--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
|
|
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
|
|
@@ -120,10 +120,10 @@ ENTRY(chacha_block_xor_ssse3)
|
|
FRAME_BEGIN
|
|
|
|
# x0..3 = s0..3
|
|
- movdqa 0x00(%rdi),%xmm0
|
|
- movdqa 0x10(%rdi),%xmm1
|
|
- movdqa 0x20(%rdi),%xmm2
|
|
- movdqa 0x30(%rdi),%xmm3
|
|
+ movdqu 0x00(%rdi),%xmm0
|
|
+ movdqu 0x10(%rdi),%xmm1
|
|
+ movdqu 0x20(%rdi),%xmm2
|
|
+ movdqu 0x30(%rdi),%xmm3
|
|
movdqa %xmm0,%xmm8
|
|
movdqa %xmm1,%xmm9
|
|
movdqa %xmm2,%xmm10
|
|
@@ -205,10 +205,10 @@ ENTRY(hchacha_block_ssse3)
|
|
# %edx: nrounds
|
|
FRAME_BEGIN
|
|
|
|
- movdqa 0x00(%rdi),%xmm0
|
|
- movdqa 0x10(%rdi),%xmm1
|
|
- movdqa 0x20(%rdi),%xmm2
|
|
- movdqa 0x30(%rdi),%xmm3
|
|
+ movdqu 0x00(%rdi),%xmm0
|
|
+ movdqu 0x10(%rdi),%xmm1
|
|
+ movdqu 0x20(%rdi),%xmm2
|
|
+ movdqu 0x30(%rdi),%xmm3
|
|
|
|
mov %edx,%r8d
|
|
call chacha_permute
|
|
--- a/arch/x86/crypto/chacha_glue.c
|
|
+++ b/arch/x86/crypto/chacha_glue.c
|
|
@@ -14,8 +14,6 @@
|
|
#include <linux/module.h>
|
|
#include <asm/simd.h>
|
|
|
|
-#define CHACHA_STATE_ALIGN 16
|
|
-
|
|
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
|
unsigned int len, int nrounds);
|
|
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
|
@@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8
|
|
|
|
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
|
|
{
|
|
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
|
|
-
|
|
if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
|
|
hchacha_block_generic(state, stream, nrounds);
|
|
} else {
|
|
@@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch);
|
|
|
|
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
|
{
|
|
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
|
|
-
|
|
chacha_init_generic(state, key, iv);
|
|
}
|
|
EXPORT_SYMBOL(chacha_init_arch);
|
|
@@ -148,8 +142,6 @@ EXPORT_SYMBOL(chacha_init_arch);
|
|
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
|
|
int nrounds)
|
|
{
|
|
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
|
|
-
|
|
if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
|
|
bytes <= CHACHA_BLOCK_SIZE)
|
|
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
|
|
@@ -171,15 +163,12 @@ EXPORT_SYMBOL(chacha_crypt_arch);
|
|
static int chacha_simd_stream_xor(struct skcipher_request *req,
|
|
const struct chacha_ctx *ctx, const u8 *iv)
|
|
{
|
|
- u32 *state, state_buf[16 + 2] __aligned(8);
|
|
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
|
|
struct skcipher_walk walk;
|
|
int err;
|
|
|
|
err = skcipher_walk_virt(&walk, req, false);
|
|
|
|
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
|
|
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
|
|
-
|
|
chacha_init_generic(state, ctx->key, iv);
|
|
|
|
while (walk.nbytes > 0) {
|
|
@@ -218,12 +207,10 @@ static int xchacha_simd(struct skcipher_
|
|
{
|
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
|
- u32 *state, state_buf[16 + 2] __aligned(8);
|
|
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
|
|
struct chacha_ctx subctx;
|
|
u8 real_iv[16];
|
|
|
|
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
|
|
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
|
|
chacha_init_generic(state, ctx->key, req->iv);
|
|
|
|
if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
|
|
--- a/include/crypto/chacha.h
|
|
+++ b/include/crypto/chacha.h
|
|
@@ -25,11 +25,7 @@
|
|
#define CHACHA_BLOCK_SIZE 64
|
|
#define CHACHAPOLY_IV_SIZE 12
|
|
|
|
-#ifdef CONFIG_X86_64
|
|
-#define CHACHA_STATE_WORDS ((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
|
|
-#else
|
|
#define CHACHA_STATE_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32))
|
|
-#endif
|
|
|
|
/* 192-bit nonce, then 64-bit stream position */
|
|
#define XCHACHA_IV_SIZE 32
|