Merge rust-bitcoin/rust-bitcoin#1990: Introduce the `small-hash` feature for `bitcoin_hashes`

f2c5f19557 Introduce the `small-hash` feature for `bitcoin_hashes` (Alekos Filini)

Pull request description:

  When enabled this feature swaps the hash implementation of sha512, sha256 and ripemd160 for a smaller (but also slower) one.

  On embedded processors (Cortex-M4) it can lead to up to a 52% size reduction, from around 37KiB for just the `process_block` methods of the three hash functions to 17.8KiB.

  The following numbers were collected on `aarch64-unknown-linux-gnu` with `cargo 1.72.0-nightly`.

  ## Original

  ```
  RUSTFLAGS='--cfg=bench -C opt-level=z' cargo bench
  ```

  ```
  test hash160::benches::hash160_10                 ... bench:          33 ns/iter (+/- 1) = 303 MB/s
  test hash160::benches::hash160_1k                 ... bench:       2,953 ns/iter (+/- 187) = 346 MB/s
  test hash160::benches::hash160_64k                ... bench:     188,480 ns/iter (+/- 11,595) = 347 MB/s
  test hmac::benches::hmac_sha256_10                ... bench:          33 ns/iter (+/- 2) = 303 MB/s
  test hmac::benches::hmac_sha256_1k                ... bench:       2,957 ns/iter (+/- 104) = 346 MB/s
  test hmac::benches::hmac_sha256_64k               ... bench:     192,022 ns/iter (+/- 6,407) = 341 MB/s
  test ripemd160::benches::ripemd160_10             ... bench:          25 ns/iter (+/- 1) = 400 MB/s
  test ripemd160::benches::ripemd160_1k             ... bench:       2,288 ns/iter (+/- 93) = 447 MB/s
  test ripemd160::benches::ripemd160_64k            ... bench:     146,823 ns/iter (+/- 1,102) = 446 MB/s
  test sha1::benches::sha1_10                       ... bench:          41 ns/iter (+/- 0) = 243 MB/s
  test sha1::benches::sha1_1k                       ... bench:       3,844 ns/iter (+/- 70) = 266 MB/s
  test sha1::benches::sha1_64k                      ... bench:     245,854 ns/iter (+/- 10,158) = 266 MB/s
  test sha256::benches::sha256_10                   ... bench:          35 ns/iter (+/- 0) = 285 MB/s
  test sha256::benches::sha256_1k                   ... bench:       3,063 ns/iter (+/- 15) = 334 MB/s
  test sha256::benches::sha256_64k                  ... bench:     195,729 ns/iter (+/- 2,880) = 334 MB/s
  test sha256d::benches::sha256d_10                 ... bench:          34 ns/iter (+/- 1) = 294 MB/s
  test sha256d::benches::sha256d_1k                 ... bench:       3,071 ns/iter (+/- 107) = 333 MB/s
  test sha256d::benches::sha256d_64k                ... bench:     188,614 ns/iter (+/- 8,101) = 347 MB/s
  test sha512::benches::sha512_10                   ... bench:          21 ns/iter (+/- 0) = 476 MB/s
  test sha512::benches::sha512_1k                   ... bench:       1,714 ns/iter (+/- 36) = 597 MB/s
  test sha512::benches::sha512_64k                  ... bench:     110,084 ns/iter (+/- 3,637) = 595 MB/s
  test sha512_256::benches::sha512_256_10           ... bench:          22 ns/iter (+/- 1) = 454 MB/s
  test sha512_256::benches::sha512_256_1k           ... bench:       1,822 ns/iter (+/- 70) = 562 MB/s
  test sha512_256::benches::sha512_256_64k          ... bench:     116,231 ns/iter (+/- 4,745) = 563 MB/s
  test siphash24::benches::siphash24_1ki            ... bench:       1,072 ns/iter (+/- 41) = 955 MB/s
  test siphash24::benches::siphash24_1ki_hash       ... bench:       1,102 ns/iter (+/- 42) = 929 MB/s
  test siphash24::benches::siphash24_1ki_hash_u64   ... bench:       1,064 ns/iter (+/- 41) = 962 MB/s
  test siphash24::benches::siphash24_64ki           ... bench:      69,957 ns/iter (+/- 2,712) = 936 MB/
  ```

  ```
  0000000000005872 t _ZN84_$LT$bitcoin_hashes..ripemd160..HashEngine$u20$as$u20$bitcoin_hashes..HashEngine$GT$5input17hc4800746a9da7ff4E
  0000000000007956 t _ZN81_$LT$bitcoin_hashes..sha256..HashEngine$u20$as$u20$bitcoin_hashes..HashEngine$GT$5input17hf49345f65130ce9bE
  0000000000008024 t _ZN14bitcoin_hashes6sha2568Midstate10const_hash17h57317bc8012004b4E.llvm.441255102889972912
  0000000000010528 t _ZN81_$LT$bitcoin_hashes..sha512..HashEngine$u20$as$u20$bitcoin_hashes..HashEngine$GT$5input17h9bc868d4392bd9acE
  ```

  Total size: 32380 bytes

  ## With `small-hash` enabled

  ```
  RUSTFLAGS='--cfg=bench -C opt-level=z' cargo bench --features small-hash
  ```

  ```
  test hash160::benches::hash160_10                 ... bench:          52 ns/iter (+/- 3) = 192 MB/s
  test hash160::benches::hash160_1k                 ... bench:       4,817 ns/iter (+/- 286) = 212 MB/s
  test hash160::benches::hash160_64k                ... bench:     319,572 ns/iter (+/- 11,031) = 205 MB/s
  test hmac::benches::hmac_sha256_10                ... bench:          54 ns/iter (+/- 2) = 185 MB/s
  test hmac::benches::hmac_sha256_1k                ... bench:       4,846 ns/iter (+/- 204) = 211 MB/s
  test hmac::benches::hmac_sha256_64k               ... bench:     319,114 ns/iter (+/- 4,451) = 205 MB/s
  test ripemd160::benches::ripemd160_10             ... bench:          27 ns/iter (+/- 0) = 370 MB/s
  test ripemd160::benches::ripemd160_1k             ... bench:       2,358 ns/iter (+/- 150) = 434 MB/s
  test ripemd160::benches::ripemd160_64k            ... bench:     154,573 ns/iter (+/- 3,954) = 423 MB/s
  test sha1::benches::sha1_10                       ... bench:          41 ns/iter (+/- 1) = 243 MB/s
  test sha1::benches::sha1_1k                       ... bench:       3,700 ns/iter (+/- 243) = 276 MB/s
  test sha1::benches::sha1_64k                      ... bench:     231,039 ns/iter (+/- 13,989) = 283 MB/s
  test sha256::benches::sha256_10                   ... bench:          51 ns/iter (+/- 3) = 196 MB/s
  test sha256::benches::sha256_1k                   ... bench:       4,823 ns/iter (+/- 182) = 212 MB/s
  test sha256::benches::sha256_64k                  ... bench:     299,960 ns/iter (+/- 17,545) = 218 MB/s
  test sha256d::benches::sha256d_10                 ... bench:          52 ns/iter (+/- 2) = 192 MB/s
  test sha256d::benches::sha256d_1k                 ... bench:       4,827 ns/iter (+/- 323) = 212 MB/s
  test sha256d::benches::sha256d_64k                ... bench:     302,844 ns/iter (+/- 15,796) = 216 MB/s
  test sha512::benches::sha512_10                   ... bench:          34 ns/iter (+/- 1) = 294 MB/s
  test sha512::benches::sha512_1k                   ... bench:       3,002 ns/iter (+/- 123) = 341 MB/s
  test sha512::benches::sha512_64k                  ... bench:     189,767 ns/iter (+/- 10,396) = 345 MB/s
  test sha512_256::benches::sha512_256_10           ... bench:          34 ns/iter (+/- 1) = 294 MB/s
  test sha512_256::benches::sha512_256_1k           ... bench:       2,996 ns/iter (+/- 198) = 341 MB/s
  test sha512_256::benches::sha512_256_64k          ... bench:     192,024 ns/iter (+/- 8,181) = 341 MB/s
  test siphash24::benches::siphash24_1ki            ... bench:       1,081 ns/iter (+/- 65) = 947 MB/s
  test siphash24::benches::siphash24_1ki_hash       ... bench:       1,083 ns/iter (+/- 63) = 945 MB/s
  test siphash24::benches::siphash24_1ki_hash_u64   ... bench:       1,084 ns/iter (+/- 63) = 944 MB/s
  test siphash24::benches::siphash24_64ki           ... bench:      67,237 ns/iter (+/- 4,185) = 974 MB/s
  ```

  ```
  0000000000005384 t _ZN81_$LT$bitcoin_hashes..sha256..HashEngine$u20$as$u20$bitcoin_hashes..HashEngine$GT$5input17hae341658cf9b880bE
  0000000000005608 t _ZN14bitcoin_hashes9ripemd16010HashEngine13process_block17h3276b13f1e9feef8E.llvm.13618235596061801146
  0000000000005616 t _ZN14bitcoin_hashes6sha2568Midstate10const_hash17h3e6fbef64c15ee00E.llvm.7326223909590351031
  0000000000005944 t _ZN81_$LT$bitcoin_hashes..sha512..HashEngine$u20$as$u20$bitcoin_hashes..HashEngine$GT$5input17h321a237bfbe5c0bbE
  ```

  Total size: 22552 bytes

  ## Conclusion

  On `aarch64` there's overall a ~30% improvement in size, although ripemd160 doesn't really shrink that much (and its performance also aren't impacted much with only a 6% slowdown). sha512 and sha256 instead are almost 40% slower with `small-hash` enabled.

  I don't have performance numbers for other architectures, but in terms of size there was an even larger improvements on `thumbv7em-none-eabihf`, with a 52% size reduction overall:

  ```
     Size          Crate Name
  25.3KiB bitcoin_hashes <bitcoin_hashes[fe467ef2aa3a1470]::sha512::HashEngine as bitcoin_hashes[fe467ef2aa3a1470]::HashEngine>::input
   6.9KiB bitcoin_hashes <bitcoin_hashes[fe467ef2aa3a1470]::sha256::HashEngine as bitcoin_hashes[fe467ef2aa3a1470]::HashEngine>::input
   4.8KiB bitcoin_hashes <bitcoin_hashes[fe467ef2aa3a1470]::ripemd160::HashEngine as bitcoin_hashes[fe467ef2aa3a1470]::HashEngine>::input
  ```

  vs

  ```
    Size          Crate Name
  9.5KiB bitcoin_hashes <bitcoin_hashes[974bb476ef905797]::sha512::HashEngine as bitcoin_hashes[974bb476ef905797]::HashEngine>::input
  4.5KiB bitcoin_hashes <bitcoin_hashes[974bb476ef905797]::ripemd160::HashEngine>::process_block
  3.8KiB bitcoin_hashes <bitcoin_hashes[974bb476ef905797]::sha256::HashEngine as bitcoin_hashes[974bb476ef905797]::HashEngine>::input
  ```

  I'm assuming this is because on more limited architectures the compiler needs to use more instructions to move data in and out of registers (especially for sha512 which ideally would benefit from 64-bit registers), so reusing the code by moving it into functions saves a lot of those instructions.

  Also note that the `const_hash` method on `sha256` causes the compiler to emit two independent implementations. I haven't looked into the code yet, maybe there's a way to merge them so that the non-const `process_block` calls into the const fn.

  -----

  Note: commits are unverified right now because I don't have the keys available, I will sign them after addressing the review comments.

ACKs for top commit:
  apoelstra:
    ACK f2c5f19557
  tcharding:
    ACK f2c5f19557

Tree-SHA512: 1d5eb56324c458660e2571e8cf59895dc31dae9c5427c7ed36f8a0e81ca2e9a0f39026f56b6803df03635cc8b66aee3bf5182d51ab8972d169d56bcfec33771c
This commit is contained in:
Andrew Poelstra 2023-08-17 16:28:11 +00:00
commit 1b3a9d3580
No known key found for this signature in database
GPG Key ID: C588D63CE41B97C1
5 changed files with 159 additions and 38 deletions

View File

@ -1 +1,2 @@
msrv = "1.48.0"
too-many-arguments-threshold = 13

View File

@ -19,6 +19,8 @@ alloc = ["internals/alloc", "hex/alloc"]
serde-std = ["serde/std"]
# If you want I/O you must enable either "std" or "core2".
core2 = ["actual-core2", "hex/core2"]
# Smaller (but slower) implementation of sha256, sha512 and ripemd160
small-hash = []
[package.metadata.docs.rs]
all-features = true

View File

@ -90,6 +90,31 @@ impl crate::HashEngine for HashEngine {
engine_input_impl!();
}
#[cfg(feature = "small-hash")]
#[macro_use]
mod small_hash {
#[rustfmt::skip]
pub(super) fn round(a: u32, _b: u32, c: u32, _d: u32, e: u32,
x: u32, bits: u32, add: u32, round: u32,
) -> (u32, u32) {
let a = a.wrapping_add(round).wrapping_add(x).wrapping_add(add);
let a = a.rotate_left(bits).wrapping_add(e);
let c = c.rotate_left(10);
(a, c)
}
macro_rules! round(
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr,
$x:expr, $bits:expr, $add:expr, $round:expr) => ({
let updates = small_hash::round($a, $b, $c, $d, $e, $x, $bits, $add, $round);
$a = updates.0;
$c = updates.1;
});
);
}
#[cfg(not(feature = "small-hash"))]
macro_rules! round(
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr,
$x:expr, $bits:expr, $add:expr, $round:expr) => ({

View File

@ -190,27 +190,75 @@ impl hex::FromHex for Midstate {
}
}
macro_rules! Ch( ($x:expr, $y:expr, $z:expr) => ($z ^ ($x & ($y ^ $z))) );
macro_rules! Maj( ($x:expr, $y:expr, $z:expr) => (($x & $y) | ($z & ($x | $y))) );
macro_rules! Sigma0( ($x:expr) => ($x.rotate_left(30) ^ $x.rotate_left(19) ^ $x.rotate_left(10)) );
macro_rules! Sigma1( ($x:expr) => ( $x.rotate_left(26) ^ $x.rotate_left(21) ^ $x.rotate_left(7)) );
macro_rules! sigma0( ($x:expr) => ($x.rotate_left(25) ^ $x.rotate_left(14) ^ ($x >> 3)) );
macro_rules! sigma1( ($x:expr) => ($x.rotate_left(15) ^ $x.rotate_left(13) ^ ($x >> 10)) );
#[allow(non_snake_case)]
const fn Ch(x: u32, y: u32, z: u32) -> u32 { z ^ (x & (y ^ z)) }
#[allow(non_snake_case)]
const fn Maj(x: u32, y: u32, z: u32) -> u32 { (x & y) | (z & (x | y)) }
#[allow(non_snake_case)]
const fn Sigma0(x: u32) -> u32 { x.rotate_left(30) ^ x.rotate_left(19) ^ x.rotate_left(10) }
#[allow(non_snake_case)]
const fn Sigma1(x: u32) -> u32 { x.rotate_left(26) ^ x.rotate_left(21) ^ x.rotate_left(7) }
const fn sigma0(x: u32) -> u32 { x.rotate_left(25) ^ x.rotate_left(14) ^ (x >> 3) }
const fn sigma1(x: u32) -> u32 { x.rotate_left(15) ^ x.rotate_left(13) ^ (x >> 10) }
#[cfg(feature = "small-hash")]
#[macro_use]
mod small_hash {
use super::*;
#[rustfmt::skip]
pub(super) const fn round(a: u32, b: u32, c: u32, d: u32, e: u32,
f: u32, g: u32, h: u32, k: u32, w: u32) -> (u32, u32) {
let t1 =
h.wrapping_add(Sigma1(e)).wrapping_add(Ch(e, f, g)).wrapping_add(k).wrapping_add(w);
let t2 = Sigma0(a).wrapping_add(Maj(a, b, c));
(d.wrapping_add(t1), t1.wrapping_add(t2))
}
#[rustfmt::skip]
pub(super) const fn later_round(a: u32, b: u32, c: u32, d: u32, e: u32,
f: u32, g: u32, h: u32, k: u32, w: u32,
w1: u32, w2: u32, w3: u32,
) -> (u32, u32, u32) {
let w = w.wrapping_add(sigma1(w1)).wrapping_add(w2).wrapping_add(sigma0(w3));
let (d, h) = round(a, b, c, d, e, f, g, h, k, w);
(d, h, w)
}
macro_rules! round(
// first round
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr, $k:expr, $w:expr) => (
let t1 = $h.wrapping_add(Sigma1!($e)).wrapping_add(Ch!($e, $f, $g)).wrapping_add($k).wrapping_add($w);
let t2 = Sigma0!($a).wrapping_add(Maj!($a, $b, $c));
let updates = small_hash::round($a, $b, $c, $d, $e, $f, $g, $h, $k, $w);
$d = updates.0;
$h = updates.1;
);
// later rounds we reassign $w before doing the first-round computation
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr, $k:expr, $w:expr, $w1:expr, $w2:expr, $w3:expr) => (
let updates = small_hash::later_round($a, $b, $c, $d, $e, $f, $g, $h, $k, $w, $w1, $w2, $w3);
$d = updates.0;
$h = updates.1;
$w = updates.2;
)
);
}
#[cfg(not(feature = "small-hash"))]
#[macro_use]
mod fast_hash {
macro_rules! round(
// first round
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr, $k:expr, $w:expr) => (
let t1 = $h.wrapping_add(Sigma1($e)).wrapping_add(Ch($e, $f, $g)).wrapping_add($k).wrapping_add($w);
let t2 = Sigma0($a).wrapping_add(Maj($a, $b, $c));
$d = $d.wrapping_add(t1);
$h = t1.wrapping_add(t2);
);
// later rounds we reassign $w before doing the first-round computation
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr, $k:expr, $w:expr, $w1:expr, $w2:expr, $w3:expr) => (
$w = $w.wrapping_add(sigma1!($w1)).wrapping_add($w2).wrapping_add(sigma0!($w3));
$w = $w.wrapping_add(sigma1($w1)).wrapping_add($w2).wrapping_add(sigma0($w3));
round!($a, $b, $c, $d, $e, $f, $g, $h, $k, $w);
)
);
}
impl Midstate {
#[allow(clippy::identity_op)] // more readble

View File

@ -123,27 +123,72 @@ pub(crate) fn from_engine(e: HashEngine) -> Hash {
Hash(hash)
}
macro_rules! Ch( ($x:expr, $y:expr, $z:expr) => ($z ^ ($x & ($y ^ $z))) );
macro_rules! Maj( ($x:expr, $y:expr, $z:expr) => (($x & $y) | ($z & ($x | $y))) );
macro_rules! Sigma0( ($x:expr) => ($x.rotate_left(36) ^ $x.rotate_left(30) ^ $x.rotate_left(25)) );
macro_rules! Sigma1( ($x:expr) => ($x.rotate_left(50) ^ $x.rotate_left(46) ^ $x.rotate_left(23)) );
macro_rules! sigma0( ($x:expr) => ($x.rotate_left(63) ^ $x.rotate_left(56) ^ ($x >> 7)) );
macro_rules! sigma1( ($x:expr) => ($x.rotate_left(45) ^ $x.rotate_left(3) ^ ($x >> 6)) );
#[allow(non_snake_case)]
fn Ch(x: u64, y: u64, z: u64) -> u64 { z ^ (x & (y ^ z)) }
#[allow(non_snake_case)]
fn Maj(x: u64, y: u64, z: u64) -> u64 { (x & y) | (z & (x | y)) }
#[allow(non_snake_case)]
fn Sigma0(x: u64) -> u64 { x.rotate_left(36) ^ x.rotate_left(30) ^ x.rotate_left(25) }
#[allow(non_snake_case)]
fn Sigma1(x: u64) -> u64 { x.rotate_left(50) ^ x.rotate_left(46) ^ x.rotate_left(23) }
fn sigma0(x: u64) -> u64 { x.rotate_left(63) ^ x.rotate_left(56) ^ (x >> 7) }
fn sigma1(x: u64) -> u64 { x.rotate_left(45) ^ x.rotate_left(3) ^ (x >> 6) }
#[cfg(feature = "small-hash")]
#[macro_use]
mod small_hash {
use super::*;
#[rustfmt::skip]
pub(super) fn round(a: u64, b: u64, c: u64, d: &mut u64, e: u64,
f: u64, g: u64, h: &mut u64, k: u64, w: u64,
) {
let t1 =
h.wrapping_add(Sigma1(e)).wrapping_add(Ch(e, f, g)).wrapping_add(k).wrapping_add(w);
let t2 = Sigma0(a).wrapping_add(Maj(a, b, c));
*d = d.wrapping_add(t1);
*h = t1.wrapping_add(t2);
}
#[rustfmt::skip]
pub(super) fn later_round(a: u64, b: u64, c: u64, d: &mut u64, e: u64,
f: u64, g: u64, h: &mut u64, k: u64, w: u64,
w1: u64, w2: u64, w3: u64,
) -> u64 {
let w = w.wrapping_add(sigma1(w1)).wrapping_add(w2).wrapping_add(sigma0(w3));
round(a, b, c, d, e, f, g, h, k, w);
w
}
macro_rules! round(
// first round
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr, $k:expr, $w:expr) => (
let t1 = $h.wrapping_add(Sigma1!($e)).wrapping_add(Ch!($e, $f, $g)).wrapping_add($k).wrapping_add($w);
let t2 = Sigma0!($a).wrapping_add(Maj!($a, $b, $c));
small_hash::round($a, $b, $c, &mut $d, $e, $f, $g, &mut $h, $k, $w)
);
// later rounds we reassign $w before doing the first-round computation
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr, $k:expr, $w:expr, $w1:expr, $w2:expr, $w3:expr) => (
$w = small_hash::later_round($a, $b, $c, &mut $d, $e, $f, $g, &mut $h, $k, $w, $w1, $w2, $w3)
)
);
}
#[cfg(not(feature = "small-hash"))]
#[macro_use]
mod fast_hash {
macro_rules! round(
// first round
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr, $k:expr, $w:expr) => (
let t1 = $h.wrapping_add(Sigma1($e)).wrapping_add(Ch($e, $f, $g)).wrapping_add($k).wrapping_add($w);
let t2 = Sigma0($a).wrapping_add(Maj($a, $b, $c));
$d = $d.wrapping_add(t1);
$h = t1.wrapping_add(t2);
);
// later rounds we reassign $w before doing the first-round computation
($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr, $k:expr, $w:expr, $w1:expr, $w2:expr, $w3:expr) => (
$w = $w.wrapping_add(sigma1!($w1)).wrapping_add($w2).wrapping_add(sigma0!($w3));
$w = $w.wrapping_add(sigma1($w1)).wrapping_add($w2).wrapping_add(sigma0($w3));
round!($a, $b, $c, $d, $e, $f, $g, $h, $k, $w);
)
);
}
impl HashEngine {
// Algorithm copied from libsecp256k1