Add several unsafe optimizations and special trade-off decisions

Partially based on https://github.com/llamasoft/secp256k1_fast_unsafe, adapted to a newer secp256k1 codebase.
Christian Reitter 2023-12-09 19:01:52 +01:00
parent 6648126c69
commit 43aea399d1
17 changed files with 1294533 additions and 15 deletions

.gitignore
View File

@@ -1,2 +1,4 @@
target/
Cargo.lock
.gdb_history

View File

@@ -1,6 +1,6 @@
[package]
name = "secp256k1"
version = "0.29.0"
version = "0.99.0"
authors = [ "Dawid Ciężarkiewicz <dpc@ucore.info>",
"Andrew Poelstra <apoelstra@wpsoftware.net>" ]
license = "CC0-1.0"
@@ -36,7 +36,7 @@ global-context = ["std"]
global-context-less-secure = ["global-context"]
[dependencies]
secp256k1-sys = { version = "0.10.0", default-features = false, path = "./secp256k1-sys" }
secp256k1-sys = { version = "0.99.0", default-features = false, path = "./secp256k1-sys" }
serde = { version = "1.0.103", default-features = false, optional = true }
# You likely only want to enable these if you explicitly do not want to use "std", otherwise enable

View File

@@ -1,6 +1,6 @@
[package]
name = "secp256k1-sys"
version = "0.10.0"
version = "0.99.0"
authors = [ "Dawid Ciężarkiewicz <dpc@ucore.info>",
"Andrew Poelstra <apoelstra@wpsoftware.net>",
"Steven Roose <steven@stevenroose.org>" ]

View File

@@ -35,8 +35,29 @@ fn main() {
base_config.define("ECMULT_WINDOW_SIZE", Some("4")); // A low-enough value to consume negligible memory
base_config.define("ECMULT_GEN_PREC_BITS", Some("2"));
} else {
base_config.define("ECMULT_GEN_PREC_BITS", Some("4"));
// definitely slower, not meaningful
// base_config.define("ECMULT_GEN_PREC_BITS", Some("2"));
// base_config.define("ECMULT_GEN_PREC_BITS", Some("4"));
// going from the default 4 to 8 is a huge performance increase
// when combined with other optimizations
base_config.define("ECMULT_GEN_PREC_BITS", Some("8"));
// TODO highly experimental
// non-standard optimization
// slow compilation
// base_config.define("ECMULT_GEN_PREC_BITS", Some("16"));
// minor factor, the 15 default is already optimized
base_config.define("ECMULT_WINDOW_SIZE", Some("15")); // This is the default in the configure file (`auto`)
// TODO do some fine tuning for a slightly higher than default ECMULT_WINDOW_SIZE
// base_config.define("ECMULT_WINDOW_SIZE", Some("17"));
// base_config.define("ECMULT_WINDOW_SIZE", Some("18"));
// too large
// base_config.define("ECMULT_WINDOW_SIZE", Some("22"));
}
base_config.define("USE_EXTERNAL_DEFAULT_CALLBACKS", Some("1"));
#[cfg(feature = "recovery")]
@@ -54,12 +75,60 @@ fn main() {
.file("depend/secp256k1/src/precomputed_ecmult.c")
.file("depend/secp256k1/src/secp256k1.c");
// default CC is gcc on the tested system
// GCC 12.2 slightly faster than clang-18 for the unmodified code?
// for the modified code, clang is slightly faster?
// // force clang
base_config.compiler("clang");
// // gcc is faster without native CPU settings on a Ryzen Zen 3 desktop CPU
base_config.flag("-march=native");
// doesn't work as-is
// base_config.flag("-flto=thin");
// shouldn't do much
base_config.define("NDEBUG", Some(""));
// enable custom modifications on the codebase
base_config.define("OPTIMIZE_UNSAFE_VAR_VARIANT", Some(""));
base_config.define("OPTIMIZE_UNSAFE_SHORTCIRCUIT", Some(""));
base_config.define("OPTIMIZE_UNSAFE_SKIP_ZEROING", Some(""));
base_config.define("OPTIMIZE_UNSAFE_ECMULT_GEN_CORE", Some(""));
base_config.define("OPTIMIZE_UNSAFE_SKIP_MASKING", Some(""));
// This is slower, and removed in a future secp256k1 code version
// base_config.define("USE_ASM_X86_64", Some(""));
// -O3 seems to be better than -O2 for clang
// -O3 is already the default for --release builds
// set anyway so that rust non-release builds don't fall back to -O1 here
base_config.flag("-O3");
// base_config.flag("-O1");
// Extra slow, for benchmark testing
// base_config.flag("-Os");
// the default seems to set -gdwarf-4
// slightly worse on gcc?
// slightly better on clang-18 ?
// base_config.flag("-g0");
// potential minor improvement
// to counteract the -fno-omit-frame-pointer
// base_config.flag("-fomit-frame-pointer");
if base_config.try_compile("libsecp256k1.a").is_err() {
// Some embedded platforms may not have, eg, string.h available, so if the build fails
// simply try again with the wasm sysroot (but without the wasm type sizes) in the hopes
// that it works.
base_config.include("wasm/wasm-sysroot");
base_config.compile("libsecp256k1.a");
// base_config.include("wasm/wasm-sysroot");
// base_config.compile("libsecp256k1.a");
}
}
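
For context on the ECMULT_GEN_PREC_BITS values tried above: the signing precomputation table grows quickly with this setting. A rough sketch (not part of the commit), assuming the upstream definitions ECMULT_GEN_PREC_N(bits) = 256 / bits, ECMULT_GEN_PREC_G(bits) = 1 << bits and 64-byte ge_storage entries:

    /* Back-of-the-envelope size of the ecmult_gen precomputed table for the
     * candidate ECMULT_GEN_PREC_BITS values, assuming the upstream layout of
     * (256 / bits) windows times (1 << bits) entries of 64 bytes each. */
    #include <stdio.h>

    int main(void) {
        int candidates[] = {2, 4, 8, 16};
        for (int i = 0; i < 4; i++) {
            int bits = candidates[i];
            long entries = (256L / bits) * (1L << bits);
            printf("ECMULT_GEN_PREC_BITS=%-2d -> %7ld entries, ~%5ld KiB\n",
                   bits, entries, entries * 64 / 1024);
        }
        return 0;
    }

This works out to roughly 32 KiB for 2, 64 KiB for 4, 512 KiB for 8 and 64 MiB for 16, which lines up with the comments above: 8 trades a much larger table for speed, while 16 makes the table (and the generated precomputed source file) very large, hence the slow compilation.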

View File

@@ -12,7 +12,7 @@
#include "scratch.h"
#ifndef ECMULT_WINDOW_SIZE
# define ECMULT_WINDOW_SIZE 15
# define ECMULT_WINDOW_SIZE 19
# ifdef DEBUG_CONFIG
# pragma message DEBUG_CONFIG_MSG("ECMULT_WINDOW_SIZE undefined, assuming default value")
# endif
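
For scale on this default change (and the window sizes tried in the build script above), a rough sketch under the assumption that each of the two static ecmult tables holds 1 << (ECMULT_WINDOW_SIZE - 2) entries of 64-byte ge_storage; this is not part of the commit:

    /* Rough memory use of the two precomputed ecmult tables for the window
     * sizes mentioned in this commit. */
    #include <stdio.h>

    int main(void) {
        int windows[] = {15, 19, 22};
        for (int i = 0; i < 3; i++) {
            int w = windows[i];
            long entries_per_table = 1L << (w - 2);
            long total_mib = 2 * entries_per_table * 64 / (1024 * 1024);
            printf("ECMULT_WINDOW_SIZE=%d -> %ld MiB of tables\n", w, total_mib);
        }
        return 0;
    }

That is roughly 1 MiB for the upstream default of 15, 16 MiB for the new default of 19 here, and 128 MiB for the 22 that the build script marks as too large.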

View File

@@ -21,7 +21,7 @@
# pragma message DEBUG_CONFIG_DEF(ECMULT_GEN_PREC_BITS)
#endif
#if ECMULT_GEN_PREC_BITS != 2 && ECMULT_GEN_PREC_BITS != 4 && ECMULT_GEN_PREC_BITS != 8
#if ECMULT_GEN_PREC_BITS != 2 && ECMULT_GEN_PREC_BITS != 4 && ECMULT_GEN_PREC_BITS != 8 && ECMULT_GEN_PREC_BITS != 16
# error "Set ECMULT_GEN_PREC_BITS to 2, 4 or 8."
#endif

View File

@@ -44,21 +44,29 @@ static void rustsecp256k1_v0_10_0_ecmult_gen_context_clear(rustsecp256k1_v0_10_0
*/
static void rustsecp256k1_v0_10_0_ecmult_gen(const rustsecp256k1_v0_10_0_ecmult_gen_context *ctx, rustsecp256k1_v0_10_0_gej *r, const rustsecp256k1_v0_10_0_scalar *gn) {
int bits = ECMULT_GEN_PREC_BITS;
#ifndef OPTIMIZE_UNSAFE_ECMULT_GEN_CORE
int g = ECMULT_GEN_PREC_G(bits);
#endif
int n = ECMULT_GEN_PREC_N(bits);
rustsecp256k1_v0_10_0_ge add;
#ifndef OPTIMIZE_UNSAFE_ECMULT_GEN_CORE
rustsecp256k1_v0_10_0_ge_storage adds;
#endif
rustsecp256k1_v0_10_0_scalar gnb;
int i, j, n_i;
#ifndef OPTIMIZE_UNSAFE_SKIP_ZEROING
memset(&adds, 0, sizeof(adds));
#endif
*r = ctx->initial;
/* Blind scalar/point multiplication by computing (n-b)G + bG instead of nG. */
rustsecp256k1_v0_10_0_scalar_add(&gnb, gn, &ctx->blind);
add.infinity = 0;
for (i = 0; i < n; i++) {
n_i = rustsecp256k1_v0_10_0_scalar_get_bits(&gnb, i * bits, bits);
#ifndef OPTIMIZE_UNSAFE_ECMULT_GEN_CORE
for (j = 0; j < g; j++) {
/** This uses a conditional move to avoid any secret data in array indexes.
* _Any_ use of secret indexes has been demonstrated to result in timing
@@ -74,10 +82,20 @@ static void rustsecp256k1_v0_10_0_ecmult_gen(const rustsecp256k1_v0_10_0_ecmult_
}
rustsecp256k1_v0_10_0_ge_from_storage(&add, &adds);
rustsecp256k1_v0_10_0_gej_add_ge(r, r, &add);
#else
// Mostly untested
// based on https://github.com/bitcoin-core/secp256k1/commit/ec776ebdf42956e847b13212f489ca78eb6920c8
// unlocks major performance improvement
rustsecp256k1_v0_10_0_ge_from_storage(&add, &rustsecp256k1_v0_10_0_ecmult_gen_prec_table[i][n_i]);
rustsecp256k1_v0_10_0_gej_add_ge_var(r, r, &add, NULL);
(void)j;
#endif
}
#ifndef OPTIMIZE_UNSAFE_SKIP_ZEROING
n_i = 0;
rustsecp256k1_v0_10_0_ge_clear(&add);
rustsecp256k1_v0_10_0_scalar_clear(&gnb);
#endif
}
/* Setup blinding values for rustsecp256k1_v0_10_0_ecmult_gen. */
@@ -104,7 +122,9 @@ static void rustsecp256k1_v0_10_0_ecmult_gen_blind(rustsecp256k1_v0_10_0_ecmult_
VERIFY_CHECK(seed32 != NULL);
memcpy(keydata + 32, seed32, 32);
rustsecp256k1_v0_10_0_rfc6979_hmac_sha256_initialize(&rng, keydata, 64);
#ifndef OPTIMIZE_UNSAFE_SKIP_ZEROING
memset(keydata, 0, sizeof(keydata));
#endif
rustsecp256k1_v0_10_0_rfc6979_hmac_sha256_generate(&rng, nonce32, 32);
rustsecp256k1_v0_10_0_fe_set_b32_mod(&s, nonce32);
rustsecp256k1_v0_10_0_fe_cmov(&s, &rustsecp256k1_v0_10_0_fe_one, rustsecp256k1_v0_10_0_fe_normalizes_to_zero(&s));
@@ -117,14 +137,18 @@ static void rustsecp256k1_v0_10_0_ecmult_gen_blind(rustsecp256k1_v0_10_0_ecmult_
/* A blinding value of 0 works, but would undermine the projection hardening. */
rustsecp256k1_v0_10_0_scalar_cmov(&b, &rustsecp256k1_v0_10_0_scalar_one, rustsecp256k1_v0_10_0_scalar_is_zero(&b));
rustsecp256k1_v0_10_0_rfc6979_hmac_sha256_finalize(&rng);
#ifndef OPTIMIZE_UNSAFE_SKIP_ZEROING
memset(nonce32, 0, 32);
#endif
/* The random projection in ctx->initial ensures that gb will have a random projection. */
rustsecp256k1_v0_10_0_ecmult_gen(ctx, &gb, &b);
rustsecp256k1_v0_10_0_scalar_negate(&b, &b);
ctx->blind = b;
ctx->initial = gb;
#ifndef OPTIMIZE_UNSAFE_SKIP_ZEROING
rustsecp256k1_v0_10_0_scalar_clear(&b);
rustsecp256k1_v0_10_0_gej_clear(&gb);
#endif
}
#endif /* SECP256K1_ECMULT_GEN_IMPL_H */
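
The loop change above is the core of OPTIMIZE_UNSAFE_ECMULT_GEN_CORE: instead of scanning all g table entries per window and keeping the wanted one via conditional moves, it indexes the table directly with the secret window value and uses the variable-time group addition. A self-contained toy (illustrative names, plain integers instead of group elements, not the library's code) showing the access-pattern difference:

    #include <stdint.h>
    #include <stdio.h>

    #define G 16 /* entries per window, i.e. 1 << bits for bits = 4 */

    /* Constant-time style: read every slot and mask out all but the wanted
     * one, so the memory access pattern does not depend on the secret index. */
    static uint64_t select_masked(const uint64_t table[G], unsigned secret_idx) {
        uint64_t out = 0;
        for (unsigned j = 0; j < G; j++) {
            uint64_t mask = (uint64_t)0 - (uint64_t)(j == secret_idx);
            out |= table[j] & mask;
        }
        return out;
    }

    /* "Unsafe" style: a single load whose address depends on the secret
     * index; faster, but observable through cache/timing side channels. */
    static uint64_t select_direct(const uint64_t table[G], unsigned secret_idx) {
        return table[secret_idx];
    }

    int main(void) {
        uint64_t table[G];
        for (unsigned j = 0; j < G; j++) table[j] = 1000 + j;
        printf("%llu %llu\n",
               (unsigned long long)select_masked(table, 5),
               (unsigned long long)select_direct(table, 5));
        return 0;
    }

Both selections return the same value; the difference is only how much the lookup reveals about the secret index, which is exactly the property the upstream comment about secret array indexes is protecting.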

View File

@@ -75,10 +75,19 @@ static const rustsecp256k1_v0_10_0_fe rustsecp256k1_v0_10_0_const_beta = SECP256
/* In non-VERIFY mode, we #define the fe operations to be identical to their
* internal field implementation, to avoid the potential overhead of a
* function call (even though presumably inlinable). */
#ifdef OPTIMIZE_UNSAFE_VAR_VARIANT
# define rustsecp256k1_v0_10_0_fe_normalize rustsecp256k1_v0_10_0_fe_impl_normalize_var
#else
# define rustsecp256k1_v0_10_0_fe_normalize rustsecp256k1_v0_10_0_fe_impl_normalize
#endif
# define rustsecp256k1_v0_10_0_fe_normalize_weak rustsecp256k1_v0_10_0_fe_impl_normalize_weak
# define rustsecp256k1_v0_10_0_fe_normalize_var rustsecp256k1_v0_10_0_fe_impl_normalize_var
#ifdef OPTIMIZE_UNSAFE_VAR_VARIANT
# define rustsecp256k1_v0_10_0_fe_normalizes_to_zero rustsecp256k1_v0_10_0_fe_impl_normalizes_to_zero_var
#else
# define rustsecp256k1_v0_10_0_fe_normalizes_to_zero rustsecp256k1_v0_10_0_fe_impl_normalizes_to_zero
#endif
# define rustsecp256k1_v0_10_0_fe_normalizes_to_zero_var rustsecp256k1_v0_10_0_fe_impl_normalizes_to_zero_var
# define rustsecp256k1_v0_10_0_fe_set_int rustsecp256k1_v0_10_0_fe_impl_set_int
# define rustsecp256k1_v0_10_0_fe_clear rustsecp256k1_v0_10_0_fe_impl_clear
@@ -96,7 +105,11 @@ static const rustsecp256k1_v0_10_0_fe rustsecp256k1_v0_10_0_const_beta = SECP256
# define rustsecp256k1_v0_10_0_fe_cmov rustsecp256k1_v0_10_0_fe_impl_cmov
# define rustsecp256k1_v0_10_0_fe_to_storage rustsecp256k1_v0_10_0_fe_impl_to_storage
# define rustsecp256k1_v0_10_0_fe_from_storage rustsecp256k1_v0_10_0_fe_impl_from_storage
#ifdef OPTIMIZE_UNSAFE_VAR_VARIANT
# define rustsecp256k1_v0_10_0_fe_inv rustsecp256k1_v0_10_0_fe_impl_inv_var
#else
# define rustsecp256k1_v0_10_0_fe_inv rustsecp256k1_v0_10_0_fe_impl_inv
#endif
# define rustsecp256k1_v0_10_0_fe_inv_var rustsecp256k1_v0_10_0_fe_impl_inv_var
# define rustsecp256k1_v0_10_0_fe_get_bounds rustsecp256k1_v0_10_0_fe_impl_get_bounds
# define rustsecp256k1_v0_10_0_fe_half rustsecp256k1_v0_10_0_fe_impl_half

View File

@@ -1236,4 +1236,9 @@ static int rustsecp256k1_v0_10_0_fe_impl_is_square_var(const rustsecp256k1_v0_10
return ret;
}
#ifdef OPTIMIZE_UNSAFE_VAR_VARIANT
// BREAKING OPTIMIZATION
#define rustsecp256k1_v0_10_0_fe_impl_inv(r, x) rustsecp256k1_v0_10_0_fe_impl_inv_var(r, x)
#endif
#endif /* SECP256K1_FIELD_REPR_IMPL_H */

View File

@@ -176,7 +176,11 @@ static int rustsecp256k1_v0_10_0_fe_impl_normalizes_to_zero_var(const rustsecp25
z1 = z0 ^ 0x1000003D0ULL;
/* Fast return path should catch the majority of cases */
#ifndef OPTIMIZE_UNSAFE_SHORTCIRCUIT
if ((z0 != 0ULL) & (z1 != 0xFFFFFFFFFFFFFULL)) {
#else
if ((z0 != 0ULL) && (z1 != 0xFFFFFFFFFFFFFULL)) {
#endif
return 0;
}
@@ -205,7 +209,17 @@ SECP256K1_INLINE static void rustsecp256k1_v0_10_0_fe_impl_set_int(rustsecp256k1
SECP256K1_INLINE static int rustsecp256k1_v0_10_0_fe_impl_is_zero(const rustsecp256k1_v0_10_0_fe *a) {
const uint64_t *t = a->n;
#ifndef OPTIMIZE_UNSAFE_SHORTCIRCUIT
return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0;
#else
return (
t[0] == 0
&& t[1] == 0
&& t[2] == 0
&& t[3] == 0
&& t[4] == 0
);
#endif
}
SECP256K1_INLINE static int rustsecp256k1_v0_10_0_fe_impl_is_odd(const rustsecp256k1_v0_10_0_fe *a) {
@@ -354,6 +368,7 @@ SECP256K1_INLINE static void rustsecp256k1_v0_10_0_fe_impl_sqr(rustsecp256k1_v0_
}
SECP256K1_INLINE static void rustsecp256k1_v0_10_0_fe_impl_cmov(rustsecp256k1_v0_10_0_fe *r, const rustsecp256k1_v0_10_0_fe *a, int flag) {
#ifndef OPTIMIZE_UNSAFE_SKIP_MASKING
uint64_t mask0, mask1;
volatile int vflag = flag;
SECP256K1_CHECKMEM_CHECK_VERIFY(r->n, sizeof(r->n));
@@ -364,6 +379,16 @@ SECP256K1_INLINE static void rustsecp256k1_v0_10_0_fe_impl_cmov(rustsecp256k1_v0
r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1);
r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1);
r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1);
#else
// based on https://github.com/bitcoin-core/secp256k1/commit/ec776ebdf42956e847b13212f489ca78eb6920c8
if ( !flag ) { return; }
r->n[0] = a->n[0];
r->n[1] = a->n[1];
r->n[2] = a->n[2];
r->n[3] = a->n[3];
r->n[4] = a->n[4];
#endif
}
static SECP256K1_INLINE void rustsecp256k1_v0_10_0_fe_impl_half(rustsecp256k1_v0_10_0_fe *r) {
@@ -421,6 +446,7 @@ static SECP256K1_INLINE void rustsecp256k1_v0_10_0_fe_impl_half(rustsecp256k1_v0
}
static SECP256K1_INLINE void rustsecp256k1_v0_10_0_fe_storage_cmov(rustsecp256k1_v0_10_0_fe_storage *r, const rustsecp256k1_v0_10_0_fe_storage *a, int flag) {
/* POTENTIAL BREAKING OPTIMIZATION NOTE: this could be optimized but is likely no longer used, so not modified */
uint64_t mask0, mask1;
volatile int vflag = flag;
SECP256K1_CHECKMEM_CHECK_VERIFY(r->n, sizeof(r->n));
@@ -526,4 +552,9 @@ static int rustsecp256k1_v0_10_0_fe_impl_is_square_var(const rustsecp256k1_v0_10
return ret;
}
#ifdef OPTIMIZE_UNSAFE_VAR_VARIANT
// BREAKING OPTIMIZATION
#define rustsecp256k1_v0_10_0_fe_impl_inv(r, x) rustsecp256k1_v0_10_0_fe_impl_inv_var(r, x)
#endif
#endif /* SECP256K1_FIELD_REPR_IMPL_H */
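
The OPTIMIZE_UNSAFE_SHORTCIRCUIT hunks above follow the same pattern in both the zero check and the normalizes-to-zero fast path: upstream combines the limbs with bitwise operators so every limb is always read, while the unsafe variant uses short-circuit logic that can exit early. A small stand-alone illustration (made-up limb values, not the library's code):

    #include <stdint.h>
    #include <stdio.h>

    /* Upstream style: bitwise OR touches all five limbs unconditionally, so
     * the running time does not depend on the (possibly secret) value. */
    static int is_zero_branchless(const uint64_t t[5]) {
        return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0;
    }

    /* OPTIMIZE_UNSAFE_SHORTCIRCUIT style: && stops at the first non-zero
     * limb, faster on average but with data-dependent timing. */
    static int is_zero_shortcircuit(const uint64_t t[5]) {
        return t[0] == 0 && t[1] == 0 && t[2] == 0 && t[3] == 0 && t[4] == 0;
    }

    int main(void) {
        uint64_t fe[5] = {1, 0, 0, 0, 0};
        printf("%d %d\n", is_zero_branchless(fe), is_zero_shortcircuit(fe));
        return 0;
    }

The same trade-off applies to the cmov change further up in this file: the masked version always writes all limbs, the skip-masking version returns early when the flag is clear.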

View File

@@ -459,4 +459,9 @@ SECP256K1_INLINE static void rustsecp256k1_v0_10_0_fe_half(rustsecp256k1_v0_10_0
#endif /* defined(VERIFY) */
#ifdef OPTIMIZE_UNSAFE_VAR_VARIANT
// BREAKING OPTIMIZATION
#define rustsecp256k1_v0_10_0_fe_impl_inv(r, x) rustsecp256k1_v0_10_0_fe_impl_inv_var(r, x)
#endif
#endif /* SECP256K1_FIELD_IMPL_H */

View File

@@ -941,4 +941,10 @@ static int rustsecp256k1_v0_10_0_ge_x_frac_on_curve_var(const rustsecp256k1_v0_1
return rustsecp256k1_v0_10_0_fe_is_square_var(&r);
}
#ifdef OPTIMIZE_UNSAFE_VAR_VARIANT
/* BREAKING OPTIMIZATION Force callers to use variable runtime versions */
#define rustsecp256k1_v0_10_0_ge_set_gej(r, a) rustsecp256k1_v0_10_0_ge_set_gej_var(r, a)
#endif
#endif /* SECP256K1_GROUP_IMPL_H */

View File

@@ -54,8 +54,8 @@ static void print_two_tables(FILE *fp, int window_g) {
}
int main(void) {
/* Always compute all tables for window sizes up to 15. */
int window_g = (ECMULT_WINDOW_SIZE < 15) ? 15 : ECMULT_WINDOW_SIZE;
/* Always compute all tables for window sizes up to 19. */
int window_g = (ECMULT_WINDOW_SIZE < 19) ? 19 : ECMULT_WINDOW_SIZE;
const char outfile[] = "src/precomputed_ecmult.c";
FILE* fp;

View File

@@ -42,7 +42,8 @@ int main(int argc, char **argv) {
fprintf(fp, "#define S(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) SECP256K1_GE_STORAGE_CONST(0x##a##u,0x##b##u,0x##c##u,0x##d##u,0x##e##u,0x##f##u,0x##g##u,0x##h##u,0x##i##u,0x##j##u,0x##k##u,0x##l##u,0x##m##u,0x##n##u,0x##o##u,0x##p##u)\n");
fprintf(fp, "const rustsecp256k1_v0_10_0_ge_storage rustsecp256k1_v0_10_0_ecmult_gen_prec_table[ECMULT_GEN_PREC_N(ECMULT_GEN_PREC_BITS)][ECMULT_GEN_PREC_G(ECMULT_GEN_PREC_BITS)] = {\n");
for (bits = 2; bits <= 8; bits *= 2) {
/* for (bits = 2; bits <= 8; bits *= 2) { */
for (bits = 2; bits <= 16; bits *= 2) {
int g = ECMULT_GEN_PREC_G(bits);
int n = ECMULT_GEN_PREC_N(bits);
int inner, outer;

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -140,9 +140,17 @@ static const rustsecp256k1_v0_10_0_callback default_error_callback = {
#endif
static SECP256K1_INLINE void *checked_malloc(const rustsecp256k1_v0_10_0_callback* cb, size_t size) {
(void) cb;
/*(void) cb;
(void) size;
return NULL;
return NULL;*/
/* restore original functionality to allow building */
void *ret = malloc(size);
if (ret == NULL) {
/* optimized */
/* secp256k1_callback_call(cb, "Out of memory"); */
}
return ret;
}
#if defined(__BIGGEST_ALIGNMENT__)