Merge rust-bitcoin/rust-bitcoin#1268: Implement basic support for fast hex encoding

040b14ef1a Implement basic support for fast hex encoding (Martin Habovstiak)

Pull request description:

  There's a `hex` module in `bitcoin_hashes` which is a bit out of place and not very fast - it passes each *digit* through dynamic dispatch not only adding overhead but also not allowing `String` to make better allocations.

  This change adds basic support for hex encoding using a stack-allocated buffer which minimizes the overhead of dynamic dispatch. It also provides a new `DisplayHex` trait designed to replace `ToHex` found in `bitcoin_hashes`.

ACKs for top commit:
  apoelstra:
    ACK 040b14ef1a
  tcharding:
    ACK 040b14ef1a

Tree-SHA512: ffb2f46d5e0aa97c73a14067bdde92b7577651de1daee89fcb38b6e464b78a6ce40d39ac6bd573a9eec7274aa4e112ebc537604dca2c0d38da7d647cb548482a
This commit is contained in:
Andrew Poelstra 2022-09-14 01:37:08 +00:00
commit 720af690fc
No known key found for this signature in database
GPG Key ID: C588D63CE41B97C1
5 changed files with 578 additions and 2 deletions

View File

@ -14,7 +14,8 @@ edition = "2018"
# Please don't forget to add relevant features to docs.rs below.
[features]
default = []
std = []
std = ["alloc"]
alloc = []
[package.metadata.docs.rs]
features = ["std"]

View File

@ -0,0 +1,250 @@
//! Implements a buffered encoder.
//!
//! The main type of this module is [`BufEncoder`], which provides buffered hex encoding. This is
//! faster than the usual `write!(f, "{:02x}", b)?` in a `for` loop because it reduces dynamic
//! dispatch and decreases the number of allocations if a `String` is being created.
pub use out_bytes::OutBytes;
use super::Case;
/// Home of the `OutBytes` type.
///
/// Keeping the type in its own module hides its single field from the rest of the crate, so all
/// access has to go through the methods defined here.
mod out_bytes {
    /// A write-only view of a byte buffer.
    ///
    /// Callers normally never name this type: pass a mutable array reference to
    /// `BufEncoder::new` and the conversion happens implicitly.
    ///
    /// The wrapper exists so that the API can later switch to `[MaybeUninit<u8>]` without a
    /// breaking change. No `unsafe` is used until it is shown to be needed.
    ///
    /// It is deliberately a sized wrapper around a mutable slice rather than an unsized type:
    /// an immutable version would be useless and the sized form needs no `unsafe`.
    pub struct OutBytes<'a>(&'a mut [u8]);

    impl<'a> OutBytes<'a> {
        /// Returns the leading `len` bytes of the buffer, treating them as initialized.
        ///
        /// This is a safe function because the implementation does not use `unsafe` (yet).
        ///
        /// ## Panics
        ///
        /// Panics if `len` exceeds the length of the buffer.
        #[cfg_attr(rust_v_1_46, track_caller)]
        pub(crate) fn assume_init(&self, len: usize) -> &[u8] {
            let whole: &[u8] = &*self.0;
            &whole[..len]
        }

        /// Copies `bytes` into the buffer starting at offset `pos`.
        ///
        /// ## Panics
        ///
        /// Panics if `pos` is out of bounds or if `bytes` does not fit into the space after
        /// `pos`.
        #[cfg_attr(rust_v_1_46, track_caller)]
        pub(crate) fn write(&mut self, pos: usize, bytes: &[u8]) {
            let end = pos + bytes.len();
            let target = &mut self.0[pos..end];
            target.copy_from_slice(bytes);
        }

        /// Returns the total capacity of the buffer in bytes.
        pub(crate) fn len(&self) -> usize {
            self.0.len()
        }
    }

    /// Generates `From<&mut [u8; N]> for OutBytes` for every listed array length `N`.
    macro_rules! impl_from_array {
        ($($size:expr),* $(,)?) => {
            $(
                impl<'a> From<&'a mut [u8; $size]> for OutBytes<'a> {
                    fn from(array: &'a mut [u8; $size]) -> Self {
                        OutBytes(array)
                    }
                }
            )*
        }
    }

    // As a sanity check conversions exist only for even, non-empty array lengths.
    // The unusual lengths 66 and 130 are there for serialized public keys.
    impl_from_array!(
        2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
        64, 66, 128, 130, 256, 512, 1024, 2048, 4096, 8192,
    );
}
/// Hex encoder that writes into a caller-provided buffer.
///
/// This is the basic building block for fast hex encoding. The string-writing tools in
/// `core::fmt` go through dynamic dispatch and cannot reserve capacity in strings, so encoding
/// into a buffer first and formatting the result afterwards is significantly faster.
pub struct BufEncoder<'a> {
    buf: OutBytes<'a>,
    pos: usize,
}

impl<'a> BufEncoder<'a> {
    /// Creates an encoder that has not written anything yet.
    ///
    /// `buf` is typically a zeroed byte array living on the stack. Only even-length, non-empty
    /// arrays can be used to construct the encoder.
    #[inline]
    pub fn new<T: Into<OutBytes<'a>>>(buf: T) -> Self {
        BufEncoder { buf: buf.into(), pos: 0 }
    }

    /// Appends the two hex digits of `byte`, in the requested `case`, to the buffer.
    ///
    /// ## Panics
    ///
    /// Panics if the buffer is already full.
    #[inline]
    #[cfg_attr(rust_v_1_46, track_caller)]
    pub fn put_byte(&mut self, byte: u8, case: Case) {
        let digits = super::byte_to_hex(byte, case.table());
        self.buf.write(self.pos, &digits);
        self.pos += 2;
    }

    /// Appends the hex encoding of every byte in `bytes`, in the requested `case`.
    ///
    /// ## Panics
    ///
    /// Panics if the encoded bytes would not fit into the remaining space of the buffer.
    #[inline]
    #[cfg_attr(rust_v_1_46, track_caller)]
    pub fn put_bytes(&mut self, bytes: &[u8], case: Case) {
        // Fail up front when the output could not even fit the address space: this avoids
        // wasted work and gives the optimizer more to work with.
        let double_len = bytes.len().checked_mul(2).expect("overflow");
        assert!(double_len <= self.buf.len() - self.pos);
        for &byte in bytes {
            self.put_byte(byte, case);
        }
    }

    /// Returns `true` when the buffer has no room left for further bytes.
    #[inline]
    pub fn is_full(&self) -> bool {
        self.buf.len() == self.pos
    }

    /// Returns everything written so far as a hex `str`.
    #[inline]
    pub fn as_str(&self) -> &str {
        let written = self.buf.assume_init(self.pos);
        core::str::from_utf8(written).expect("we only write ASCII")
    }

    /// Discards the written content so the buffer can be reused from the start.
    #[inline]
    pub fn clear(&mut self) {
        self.pos = 0;
    }
}
// Unit tests for `BufEncoder`.
#[cfg(test)]
mod tests {
use super::*;
// A freshly created encoder yields an empty string and is not full.
#[test]
fn empty() {
let mut buf = [0u8; 2];
let encoder = BufEncoder::new(&mut buf);
assert_eq!(encoder.as_str(), "");
assert!(!encoder.is_full());
}
// One byte exactly fills a two-byte buffer; `clear` makes it reusable, here with the other case.
#[test]
fn single_byte_exact_buf() {
let mut buf = [0u8; 2];
let mut encoder = BufEncoder::new(&mut buf);
encoder.put_byte(42, Case::Lower);
assert_eq!(encoder.as_str(), "2a");
assert!(encoder.is_full());
encoder.clear();
assert!(!encoder.is_full());
encoder.put_byte(42, Case::Upper);
assert_eq!(encoder.as_str(), "2A");
assert!(encoder.is_full());
}
// One byte in a four-byte buffer leaves room, so the encoder must not report being full.
#[test]
fn single_byte_oversized_buf() {
let mut buf = [0u8; 4];
let mut encoder = BufEncoder::new(&mut buf);
encoder.put_byte(42, Case::Lower);
assert_eq!(encoder.as_str(), "2a");
assert!(!encoder.is_full());
encoder.clear();
encoder.put_byte(42, Case::Upper);
assert_eq!(encoder.as_str(), "2A");
assert!(!encoder.is_full());
}
// Two consecutive bytes are appended in order and together fill a four-byte buffer.
#[test]
fn two_bytes() {
let mut buf = [0u8; 4];
let mut encoder = BufEncoder::new(&mut buf);
encoder.put_byte(42, Case::Lower);
encoder.put_byte(255, Case::Lower);
assert_eq!(encoder.as_str(), "2aff");
assert!(encoder.is_full());
encoder.clear();
assert!(!encoder.is_full());
encoder.put_byte(42, Case::Upper);
encoder.put_byte(255, Case::Upper);
assert_eq!(encoder.as_str(), "2AFF");
assert!(encoder.is_full());
}
// Compares the encoder against `core::fmt`'s `{:02x}` / `{:02X}` for every possible byte value.
#[test]
fn same_as_fmt() {
use core::fmt::{self, Write};
// Minimal two-byte `fmt::Write` sink, so the reference output can be produced without
// relying on `String`.
struct Writer {
buf: [u8; 2],
pos: usize,
}
impl Writer {
fn as_str(&self) -> &str {
core::str::from_utf8(&self.buf[..self.pos]).unwrap()
}
}
impl Write for Writer {
fn write_str(&mut self, s: &str) -> fmt::Result {
assert!(self.pos <= 2);
// Report an error instead of overflowing the two-byte buffer.
if s.len() > 2 - self.pos {
Err(fmt::Error)
} else {
self.buf[self.pos..(self.pos + s.len())].copy_from_slice(s.as_bytes());
self.pos += s.len();
Ok(())
}
}
}
let mut writer = Writer {
buf: [0u8; 2],
pos: 0,
};
let mut buf = [0u8; 2];
let mut encoder = BufEncoder::new(&mut buf);
// Lower case: both sinks are reset after every byte so each comparison starts empty.
for i in 0..=255 {
write!(writer, "{:02x}", i).unwrap();
encoder.put_byte(i, Case::Lower);
assert_eq!(encoder.as_str(), writer.as_str());
writer.pos = 0;
encoder.clear();
}
// Upper case: same procedure with the upper-case format and table.
for i in 0..=255 {
write!(writer, "{:02X}", i).unwrap();
encoder.put_byte(i, Case::Upper);
assert_eq!(encoder.as_str(), writer.as_str());
writer.pos = 0;
encoder.clear();
}
}
}

View File

@ -0,0 +1,259 @@
//! Helpers for displaying bytes as hex strings.
//!
//! This module provides a trait for displaying things as hex as well as an implementation for
//! `&[u8]`.
use core::fmt;
#[cfg(feature = "alloc")]
use crate::prelude::*;
use super::buf_encoder::{BufEncoder, OutBytes};
use super::Case;
/// Extension trait for types that can be displayed as hex.
///
/// Types whose single, obvious text representation is hex should **not** implement this trait;
/// they should simply implement `Display` instead.
///
/// The trait is meant to be implemented for references only. GATs would express this better but
/// they are beyond our MSRV, so as a lint the `IsRef` supertrait, which is implemented for all
/// shared references, is required.
pub trait DisplayHex: Copy + sealed::IsRef {
    /// The type providing the [`fmt::Display`] implementation.
    ///
    /// Usually a wrapper type holding a reference to `Self`.
    type Display: fmt::Display;

    /// Displays `Self` as a continuous sequence of ASCII hex chars in the given `case`.
    fn display_hex(self, case: Case) -> Self::Display;

    /// Shorthand for `display_hex(Case::Lower)`.
    ///
    /// Saves the caller from importing the `Case` type.
    fn display_lower_hex(self) -> Self::Display {
        self.display_hex(Case::Lower)
    }

    /// Shorthand for `display_hex(Case::Upper)`.
    ///
    /// Saves the caller from importing the `Case` type.
    fn display_upper_hex(self) -> Self::Display {
        self.display_hex(Case::Upper)
    }

    /// Creates a lower-hex-encoded string.
    ///
    /// Shorthand for `to_hex_string(Case::Lower)`, so that `Case` doesn't need to be imported.
    ///
    /// This may be faster than `.display_lower_hex().to_string()` because capacity is reserved
    /// up front based on `hex_reserve_suggestion`.
    #[cfg(feature = "alloc")]
    fn to_lower_hex_string(self) -> String {
        self.to_hex_string(Case::Lower)
    }

    /// Creates an upper-hex-encoded string.
    ///
    /// Shorthand for `to_hex_string(Case::Upper)`, so that `Case` doesn't need to be imported.
    ///
    /// This may be faster than `.display_upper_hex().to_string()` because capacity is reserved
    /// up front based on `hex_reserve_suggestion`.
    #[cfg(feature = "alloc")]
    fn to_upper_hex_string(self) -> String {
        self.to_hex_string(Case::Upper)
    }

    /// Creates a hex-encoded string in the given `case`.
    ///
    /// This may be faster than `.display_hex(case).to_string()` because capacity is reserved up
    /// front based on `hex_reserve_suggestion`.
    #[cfg(feature = "alloc")]
    fn to_hex_string(self, case: Case) -> String {
        let mut encoded = String::new();
        self.append_hex_to_string(case, &mut encoded);
        encoded
    }

    /// Appends the hex-encoded content to an existing `String`.
    ///
    /// This may be faster than `write!(string, "{}", self.display_hex(case))` because capacity
    /// is reserved up front based on `hex_reserve_suggestion`.
    ///
    /// ## Panics
    ///
    /// Panics if the `Display` implementation of `Self::Display` returns an error.
    #[cfg(feature = "alloc")]
    fn append_hex_to_string(self, case: Case, string: &mut String) {
        use fmt::Write;

        string.reserve(self.hex_reserve_suggestion());
        if write!(string, "{}", self.display_hex(case)).is_err() {
            let name = core::any::type_name::<Self::Display>();
            // `std` is not expected to ever be buggy here, so the bug is most likely in the
            // `Display` impl of `Self::Display`.
            panic!("The implementation of Display for {} returned an error when it shouldn't", name)
        }
    }

    /// Hints how many bytes to reserve when creating a `String`.
    ///
    /// Implementors that know the number of produced bytes upfront should override this.
    /// Defaults to 0.
    // The name is prefixed with `hex_` to avoid potential collisions with other methods.
    fn hex_reserve_suggestion(self) -> usize {
        0
    }
}
mod sealed {
    /// Marker trait identifying shared references.
    ///
    /// It carries no methods; the only implementation is the blanket one for `&T`.
    pub trait IsRef: Copy {}

    impl<'r, T> IsRef for &'r T where T: ?Sized {}
}
/// Hex display support for plain byte slices.
impl<'a> DisplayHex for &'a [u8] {
    type Display = DisplayByteSlice<'a>;

    /// Wraps the slice together with the requested `case` for later formatting.
    #[inline]
    fn display_hex(self, case: Case) -> Self::Display {
        DisplayByteSlice { bytes: self, case }
    }

    /// Returns twice the slice length, as every byte produces two hex digits.
    ///
    /// ## Panics
    ///
    /// Panics if doubling the length overflows `usize`.
    #[inline]
    fn hex_reserve_suggestion(self) -> usize {
        // If this overflows the string could not fit into the address space anyway (this is
        // true even for smaller amounts), so panicking right away is preferable. It should
        // also give the optimizer better opportunities.
        let hex_len = self.len().checked_mul(2);
        hex_len.expect("the string wouldn't fit into address space")
    }
}
/// Displays byte slice as hex.
///
/// Created by [`<&[u8] as DisplayHex>::display_hex`](DisplayHex::display_hex).
pub struct DisplayByteSlice<'a> {
bytes: &'a [u8],
case: Case,
}
impl<'a> fmt::Display for DisplayByteSlice<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut buf = [0u8; 1024];
let mut encoder = super::BufEncoder::new(&mut buf);
let mut chunks = self.bytes.chunks_exact(512);
for chunk in &mut chunks {
encoder.put_bytes(chunk, self.case);
f.write_str(encoder.as_str())?;
encoder.clear();
}
encoder.put_bytes(chunks.remainder(), self.case);
f.write_str(encoder.as_str())
}
}
/// Format known-length array as hex.
///
/// This supports all formatting options of formatter and may be faster than calling
/// `display_hex()` on an arbitrary `&[u8]`. Note that the implementation intentionally keeps
/// leading zeros even when not requested. This is designed to display values such as hashes and
/// keys and removing leading zeros would be confusing.
///
/// ## Parameters
///
/// * `$formatter` - a [`fmt::Formatter`].
/// * `$len` - known length of `$bytes`, must be a const expression.
/// * `$bytes` - bytes to be encoded, most likely a reference to an array. The expression is
///   evaluated exactly once.
/// * `$case` - value of type [`Case`] determining whether to format as lower or upper case.
///
/// ## Panics
///
/// This macro panics if `$len` is not equal to `$bytes.len()`. It also fails to compile if `$len`
/// is more than half of `usize::MAX`.
#[macro_export]
macro_rules! fmt_hex_exact {
    ($formatter:expr, $len:expr, $bytes:expr, $case:expr) => {
        {
            // Statically check `$len`: when it is too large the index below is 1, which is out
            // of bounds for the one-element array and makes constant evaluation fail.
            #[allow(deprecated)]
            const _: () = [()][($len > usize::max_value() / 2) as usize];
            // Bind `$bytes` once so that the expression is not evaluated a second time when it
            // is passed on below. `match` is used instead of `let` so that temporaries created
            // by the expression stay alive for the whole body.
            match $bytes {
                bytes => {
                    assert_eq!(bytes.len(), $len);
                    let mut buf = [0u8; $len * 2];
                    $crate::hex::display::fmt_hex_exact_fn($formatter, (&mut buf).into(), bytes, $case)
                }
            }
        }
    }
}
// Implementation detail of the `fmt_hex_exact` macro, kept as a function to de-duplicate the
// code the macro expands to.
//
// Encodes `bytes` into `buf` in the given `case` and hands the resulting hex string to
// `Formatter::pad_integral` with the prefix "0x".
#[doc(hidden)]
#[inline]
pub fn fmt_hex_exact_fn(f: &mut fmt::Formatter, buf: OutBytes<'_>, bytes: &[u8], case: Case) -> fmt::Result {
    let encoded = {
        let mut encoder = BufEncoder::new(buf);
        encoder.put_bytes(bytes, case);
        encoder
    };
    f.pad_integral(true, "0x", encoded.as_str())
}
// Unit tests for the hex display helpers; all of them need the `alloc` feature.
#[cfg(test)]
mod tests {
#[cfg(feature = "alloc")]
use super::*;
#[cfg(feature = "alloc")]
mod alloc {
use super::*;
// Asserts that `to_lower_hex_string` agrees with formatting every byte via `{:02x}`.
fn check_encoding(bytes: &[u8]) {
use core::fmt::Write;
let s1 = bytes.to_lower_hex_string();
let mut s2 = String::with_capacity(bytes.len() * 2);
for b in bytes {
write!(s2, "{:02x}", b).unwrap();
}
assert_eq!(s1, s2);
}
#[test]
fn empty() {
check_encoding(b"");
}
#[test]
fn single() {
check_encoding(b"*");
}
#[test]
fn two() {
check_encoding(b"*x");
}
// 512 bytes is the chunk size used by the `Display` impl of `DisplayByteSlice`: exactly one
// full chunk and an empty remainder.
#[test]
fn just_below_boundary() {
check_encoding(&[42; 512]);
}
// One full chunk followed by a one-byte remainder.
#[test]
fn just_above_boundary() {
check_encoding(&[42; 513]);
}
// Two full chunks followed by a one-byte remainder.
#[test]
fn just_above_double_boundary() {
check_encoding(&[42; 1025]);
}
// `fmt_hex_exact!` used inside a `Display` impl must produce plain lower-case hex.
#[test]
fn fmt_exact_macro() {
use crate::alloc::string::ToString;
struct Dummy([u8; 32]);
impl fmt::Display for Dummy {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt_hex_exact!(f, 32, &self.0, Case::Lower)
}
}
assert_eq!(Dummy([42; 32]).to_string(), "2a".repeat(32));
}
}
}

53
internals/src/hex/mod.rs Normal file
View File

@ -0,0 +1,53 @@
//! Helpers for encoding bytes as hex strings.
pub mod buf_encoder;
pub mod display;
pub use buf_encoder::BufEncoder;
/// Re-exports of extension traits.
///
/// Importing from this module brings the `DisplayHex` extension trait into scope.
pub mod exts {
pub use super::display::DisplayHex;
}
/// The letter case used for hex digits.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
pub enum Case {
    /// Produce lower-case chars (`[0-9a-f]`).
    ///
    /// This is the default.
    Lower,

    /// Produce upper-case chars (`[0-9A-F]`).
    Upper,
}

impl Default for Case {
    /// Returns [`Case::Lower`].
    fn default() -> Self {
        Self::Lower
    }
}

impl Case {
    /// Returns the encoding table for this case.
    ///
    /// Entry `i` of the table is the ASCII hex digit for the value `i`; the table only ever
    /// contains displayable ASCII chars.
    #[inline]
    pub(crate) fn table(self) -> &'static [u8; 16] {
        match self {
            Case::Lower => b"0123456789abcdef",
            Case::Upper => b"0123456789ABCDEF",
        }
    }
}
/// Encodes a single byte as two ASCII chars looked up in `table`.
///
/// The first returned char encodes the high nibble of `byte`, the second one the low nibble.
/// Only values taken from the provided table are ever returned.
#[inline]
pub(crate) fn byte_to_hex(byte: u8, table: &[u8; 16]) -> [u8; 2] {
    let high = usize::from(byte >> 4);
    let low = usize::from(byte & 0x0F);
    [table[high], table[low]]
}

View File

@ -7,7 +7,7 @@
//! [rust-bitcoin](https://github.com/rust-bitcoin) ecosystem.
//!
#![cfg_attr(all(not(feature = "std"), not(test)), no_std)]
#![no_std]
// Experimental features we need.
#![cfg_attr(docsrs, feature(doc_cfg))]
// Coding conventions
@ -21,4 +21,17 @@
#![deny(missing_docs)]
#![deny(unused_must_use)]
#[cfg(feature = "alloc")]
extern crate alloc;
#[cfg(feature = "std")]
extern crate std;
pub mod error;
pub mod hex;
/// Mainly reexports based on features.
///
/// With the `alloc` feature enabled this makes `String` available to the rest of the crate.
pub(crate) mod prelude {
#[cfg(feature = "alloc")]
pub(crate) use alloc::string::String;
}