keyfork/crates/util/keyfork-mnemonic-util/src/lib.rs

//! Zero-dependency mnemonic encoding and decoding of data.
//!
//! Mnemonics can be used to safely encode data of 16, 24, and 32 bytes as a phrase:
//!
//! ```rust
//! use keyfork_mnemonic_util::Mnemonic;
//! let data = b"Hello, world! I am a mnemonic :)";
//! assert_eq!(data.len(), 32);
//! let mnemonic = Mnemonic::try_from_slice(data).unwrap();
//! println!("Our mnemonic is: {mnemonic}");
//! ```
//!
//! A mnemonic can also be parsed from a string:
//!
//! ```rust
//! use keyfork_mnemonic_util::Mnemonic;
//! use std::str::FromStr;
//!
//! let data = b"Hello, world! I am a mnemonic :)";
//! let words = "embody clock brand tattoo search desert saddle eternal
//!              goddess animal banner dolphin bitter mother loyal asset
//!              hover clock forward system normal mosquito trim credit";
//! let mnemonic = Mnemonic::from_str(words).unwrap();
//! assert_eq!(&data[..], mnemonic.as_bytes());
//! ```
//!
//! Mnemonics can also be used to store data of other lengths, but such functionality is not
//! verified to be safe:
//!
//! ```rust
//! use keyfork_mnemonic_util::Mnemonic;
//! let data = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
//! let mnemonic = Mnemonic::from_raw_bytes(data.as_slice());
//! let mnemonic_text = mnemonic.to_string();
//! ```
//!
//! If given an invalid length (not a multiple of 4 bytes, or more than 1024 bytes), the code
//! will panic.
//!
//! ```rust,should_panic
//! use keyfork_mnemonic_util::Mnemonic;
//! use std::str::FromStr;
//!
//! // NOTE: Data is of invalid length, 31
//! let data = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
//! // NOTE: panic happens here, because the length is not a multiple of 4
//! let mnemonic = Mnemonic::from_raw_bytes(data.as_slice());
//! let mnemonic_text = mnemonic.to_string();
//! let new_mnemonic = Mnemonic::from_str(&mnemonic_text).unwrap();
//! ```

use std::{error::Error, fmt::Display, marker::PhantomData, str::FromStr, sync::OnceLock};

use keyfork_bug::bug;

use hmac::Hmac;
use pbkdf2::pbkdf2;
use sha2::{Digest, Sha256, Sha512};

/// The error type representing a failure to create a [`Mnemonic`]. These errors only occur during
/// [`Mnemonic`] creation.
///
/// Despite the historical variant names, the values carried by
/// [`Self::InvalidByteCount`] and [`Self::InvalidByteLength`] are counts of *bits*: that is what
/// `try_from_slice` stores in them.
#[derive(Debug, Clone)]
pub enum MnemonicGenerationError {
    /// The amount of bits passed to a mnemonic must be divisible by 32. The payload is the
    /// offending bit count.
    InvalidByteCount(usize),

    /// The length of a mnemonic in bits must be within the BIP-0039 range, and supported by the
    /// library. Currently, only 128, 192 (for testing purposes), and 256 are supported. The
    /// payload is the offending bit count.
    InvalidByteLength(usize),

    /// Invalid length resulting from PBKDF2.
    InvalidPbkdf2Length,
}

impl Display for MnemonicGenerationError {
    /// Render a human-readable description of the failure.
    ///
    /// The messages describe the rule that was actually violated: the count is reported in bits,
    /// it must be divisible by 32, and the accepted lengths are 128, 192 and 256 bits.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            MnemonicGenerationError::InvalidByteCount(count) => {
                write!(f, "Invalid bit count: {count}, must be divisible by 32")
            }
            MnemonicGenerationError::InvalidByteLength(count) => {
                write!(f, "Invalid bit length: {count}, must be 128, 192, or 256")
            }
            MnemonicGenerationError::InvalidPbkdf2Length => {
                f.write_str("Invalid length from PBKDF2")
            }
        }
    }
}

impl Error for MnemonicGenerationError {}

/// A trait representing a BIP-0039 wordlist, of 2048 words, with each word being uniquely
/// identified by its first four letters.
///
/// Implementors are accessed through a shared instance (see [`Wordlist::get_singleton`]) rather
/// than being constructed by callers.
pub trait Wordlist: std::fmt::Debug {
    /// Get a reference to the shared instance of the wordlist. The [`English`] implementation
    /// backs this with a [`std::sync::OnceLock`].
    fn get_singleton<'a>() -> &'a Self;

    /// Return a representation of the words in the wordlist as an array of [`str`]. The position
    /// of a word in the array is the 11-bit value it encodes.
    fn to_str_array(&self) -> [&str; 2048];
}

/// A wordlist for the English language, from the BIP-0039 dataset.
#[derive(Debug)]
pub struct English {
    // The 2048 words, stored in wordlist order so that the array index is the word's value.
    words: [String; 2048],
}

// Lazily-initialized shared instance handed out by `English::get_singleton`.
static ENGLISH: OnceLock<English> = OnceLock::new();

impl Wordlist for English {
    /// Return the process-wide English wordlist, building it from the embedded
    /// `data/wordlist.txt` on first use.
    fn get_singleton<'a>() -> &'a Self {
        ENGLISH.get_or_init(|| {
            // The first line of the embedded file is skipped; every following line is one word.
            let mut entries = include_str!("data/wordlist.txt")
                .lines()
                .skip(1)
                .map(|line| line.trim().to_string());
            let words = std::array::from_fn(|_| {
                entries.next().expect(bug!("wordlist {} should have 2048 words"))
            });
            English { words }
        })
    }

    /// Borrow every word as a `&str`, preserving wordlist order.
    fn to_str_array(&self) -> [&str; 2048] {
        std::array::from_fn(|index| self.words[index].as_str())
    }
}

// Zero-sized helper whose associated constants turn an invalid byte length `N` into a
// compile-time error. The constants are only evaluated when something refers to them, which
// `MnemonicBase::from_array` does for the `N` it is instantiated with.
struct AssertValidMnemonicSize<const N: usize>;

impl<const N: usize> AssertValidMnemonicSize<N> {
    // One checksum bit is produced per 32 bits (4 bytes) of data.
    const OK_CHUNKS: () = assert!(N % 4 == 0, "bytes must be a length divisible by 4");
    // Upper bound on the amount of data a mnemonic may carry.
    const OK_SIZE: () = assert!(N <= 1024, "bytes must be less-or-equal 1024");
}

/// A BIP-0039 mnemonic with reference to a [`Wordlist`].
///
/// Only the raw data is stored; the words are derived from it on demand using the wordlist `W`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MnemonicBase<W: Wordlist> {
    // The decoded data, without the checksum bits.
    data: Vec<u8>,
    // Ties the mnemonic to its wordlist type without storing a wordlist value.
    marker: PhantomData<W>,
}

/// A default Mnemonic using the English language, i.e. a [`MnemonicBase`] over the [`English`]
/// wordlist.
pub type Mnemonic = MnemonicBase<English>;

impl<W> Display for MnemonicBase<W>
where
    W: Wordlist,
{
    /// Write the mnemonic as its words, separated by single spaces and without a trailing space.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let lookup = W::get_singleton().to_str_array();

        // Indices that do not resolve to a word are skipped rather than reported.
        let phrase = self
            .words()
            .into_iter()
            .filter_map(|index| lookup.get(index));

        for (position, word) in phrase.enumerate() {
            if position > 0 {
                f.write_str(" ")?;
            }
            f.write_str(word)?;
        }
        Ok(())
    }
}

/// The error type representing a failure to parse a [`Mnemonic`]. These errors only occur during
/// [`Mnemonic`] creation.
#[derive(Debug, Clone)]
pub enum MnemonicFromStrError {
    /// The amount of words used to parse a mnemonic was not correct. Carries the word count.
    InvalidWordCount(usize),

    /// One of the words used to generate the mnemonic was not found in the default wordlist.
    /// Carries the zero-based position of the word in the input.
    InvalidWord(usize),

    /// The checksum for the mnemonic did not match the given words.
    InvalidChecksum,
}

impl Display for MnemonicFromStrError {
    /// Render the error as `Mnemonic error: <detail>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("Mnemonic error: ")?;
        match *self {
            Self::InvalidWordCount(count) => {
                f.write_fmt(format_args!("Incorrect word count: {count}"))
            }
            Self::InvalidWord(index) => {
                f.write_fmt(format_args!("Unknown word at index: {index}"))
            }
            Self::InvalidChecksum => {
                f.write_str("Checksum of data did not match expected value")
            }
        }
    }
}

impl Error for MnemonicFromStrError {}

impl<W> FromStr for MnemonicBase<W>
where
    W: Wordlist,
{
    type Err = MnemonicFromStrError;

    /// Parse a whitespace-separated mnemonic phrase back into the data it encodes.
    ///
    /// Every word contributes 11 bits. Of the combined bits, 32 out of every 33 are data and the
    /// remainder is a checksum made of the leading bits of the SHA-256 hash of the data.
    ///
    /// # Errors
    /// * [`MnemonicFromStrError::InvalidWordCount`] if the number of words is not a multiple of
    ///   three (such a phrase cannot be split into whole data bytes plus checksum bits) or exceeds
    ///   768 (the most that 256 bits of SHA-256 output can provide a checksum for).
    /// * [`MnemonicFromStrError::InvalidWord`] if a word is not part of the wordlist.
    /// * [`MnemonicFromStrError::InvalidChecksum`] if any checksum bit differs from the hash of
    ///   the decoded data.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // 768 words * 11 bits = 8192 data bits + 256 checksum bits.
        const MAX_WORDS: usize = 768;

        let wordlist = W::get_singleton();
        let wordlist_words = wordlist.to_str_array();
        let words: Vec<&str> = s.split_whitespace().collect();

        // Reject impossible phrase lengths up front. Without this, bits would be silently
        // dropped for odd lengths, and overlong input would index past the end of the hash.
        if words.len() % 3 != 0 || words.len() > MAX_WORDS {
            return Err(MnemonicFromStrError::InvalidWordCount(words.len()));
        }

        // Expand every word into its 11 bits, most significant bit first.
        let mut bits = vec![false; words.len() * 11];
        for (index, word) in words.iter().enumerate() {
            let value = wordlist_words
                .iter()
                .position(|w| w == word)
                .ok_or(MnemonicFromStrError::InvalidWord(index))?;
            for bit in 0..11 {
                bits[index * 11 + bit] = (value & (1 << (10 - bit))) > 0;
            }
        }

        // Split the trailing checksum bits off, leaving only data bits in `bits`.
        let data_bit_count = bits.len() * 32 / 33;
        let checksum_bits = bits.split_off(data_bit_count);

        // Pack the data bits into bytes, most significant bit first.
        let data: Vec<u8> = bits
            .chunks_exact(8)
            .map(|chunk| {
                chunk
                    .iter()
                    .fold(0u8, |byte, &bit| (byte << 1) | u8::from(bit))
            })
            .collect();

        let mut hasher = Sha256::new();
        hasher.update(&data);
        let hash = hasher.finalize().to_vec();

        // Every checksum bit must equal the corresponding leading bit of the hash.
        for (i, bit) in checksum_bits.iter().enumerate() {
            let expected = ((hash[i / 8] >> (7 - (i % 8))) & 1) == 1;
            if expected != *bit {
                return Err(MnemonicFromStrError::InvalidChecksum);
            }
        }

        Ok(MnemonicBase {
            data,
            marker: PhantomData,
        })
    }
}

impl<W> MnemonicBase<W>
where
    W: Wordlist,
{
    /// Build a [`Mnemonic`] from a slice holding 128, 192, or 256 bits of data, as per BIP-0039.
    ///
    /// # Errors
    /// [`MnemonicGenerationError::InvalidByteCount`] is returned when the number of bits is not
    /// divisible by 32, and [`MnemonicGenerationError::InvalidByteLength`] when it is divisible
    /// by 32 but is not one of the supported lengths.
    ///
    /// # Examples
    /// ```rust
    /// use keyfork_mnemonic_util::Mnemonic;
    /// let data = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
    /// let mnemonic = Mnemonic::try_from_slice(data.as_slice()).unwrap();
    /// ```
    pub fn try_from_slice(bytes: &[u8]) -> Result<MnemonicBase<W>, MnemonicGenerationError> {
        match bytes.len() * 8 {
            // 192 supported for test suite
            128 | 192 | 256 => Ok(Self::from_raw_bytes(bytes)),
            bits if bits % 32 != 0 => Err(MnemonicGenerationError::InvalidByteCount(bits)),
            bits => Err(MnemonicGenerationError::InvalidByteLength(bits)),
        }
    }

    /// Generate a [`Mnemonic`] from the provided data and [`Wordlist`]. The length of the data
    /// must be a multiple of 4 bytes, up to 1024 bytes; both conditions are checked at compile
    /// time.
    ///
    /// ```rust
    /// use keyfork_mnemonic_util::Mnemonic;
    /// let data = b"hello world!";
    /// let mnemonic = Mnemonic::from_array(*data);
    /// ```
    ///
    /// If an invalid size is requested, the code will fail to compile:
    ///
    /// ```rust,compile_fail
    /// use keyfork_mnemonic_util::Mnemonic;
    /// let mnemonic = Mnemonic::from_array([0u8; 53]);
    /// ```
    ///
    /// ```rust,compile_fail
    /// use keyfork_mnemonic_util::Mnemonic;
    /// let mnemonic = Mnemonic::from_array([0u8; 1024 + 4]);
    /// ```
    pub fn from_array<const N: usize>(bytes: [u8; N]) -> MnemonicBase<W> {
        // Referring to the associated constants forces their `assert!`s to be evaluated for this
        // particular `N`; the bound values themselves are just `()`.
        #[allow(clippy::let_unit_value)]
        {
            let () = AssertValidMnemonicSize::<N>::OK_CHUNKS;
            let () = AssertValidMnemonicSize::<N>::OK_SIZE;
        }
        Self::from_raw_bytes(&bytes)
    }

    /// Create a Mnemonic using an arbitrary length of given data. The length does not need to
    /// conform to BIP-0039 standards, but must be a multiple of 32 bits (4 bytes) and at most
    /// 1024 bytes.
    ///
    /// This function can produce mnemonics that are not BIP-0039 compliant. The
    /// [`MnemonicBase::from_array`] function may be used instead if the length of the data is
    /// known at compile-time.
    ///
    /// # Panics
    /// Panics if the length of `bytes` is not a multiple of 4, or if it is greater than 1024.
    ///
    /// # Examples
    /// ```rust
    /// use keyfork_mnemonic_util::Mnemonic;
    /// let data = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
    /// let mnemonic = Mnemonic::from_raw_bytes(data.as_slice());
    /// let mnemonic_text = mnemonic.to_string();
    /// ```
    ///
    /// If given an invalid length, the code will panic.
    ///
    /// ```rust,should_panic
    /// use keyfork_mnemonic_util::Mnemonic;
    /// use std::str::FromStr;
    ///
    /// // NOTE: Data is of invalid length, 31
    /// let data = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
    /// let mnemonic = Mnemonic::from_raw_bytes(data.as_slice());
    /// ```
    pub fn from_raw_bytes(bytes: &[u8]) -> MnemonicBase<W> {
        // One checksum bit is generated per 32 bits of data, so partial groups are rejected.
        assert!(bytes.len() % 4 == 0);
        // 1024 bytes need 256 checksum bits, which is all a SHA-256 hash can supply.
        assert!(bytes.len() <= 1024);
        MnemonicBase {
            data: bytes.to_vec(),
            marker: PhantomData,
        }
    }

    /// Create a Mnemonic using an arbitrary length of given data. The length does not need to
    /// conform to BIP-0039 standards, but should be a multiple of 32 bits or 4 bytes.
    ///
    /// # Safety
    ///
    /// This function can potentially produce mnemonics that are not BIP-0039 compliant or can't
    /// properly be encoded as a mnemonic. It is assumed the caller asserts the byte count is `% 4
    /// == 0`. If the assumption is incorrect, code may panic.
    #[deprecated = "use Mnemonic::from_raw_bytes"]
    pub unsafe fn from_raw_entropy(bytes: &[u8]) -> MnemonicBase<W> {
        MnemonicBase {
            data: bytes.to_vec(),
            marker: PhantomData,
        }
    }

    /// Borrow the decoded data held by the mnemonic.
    pub fn as_bytes(&self) -> &[u8] {
        self.data.as_slice()
    }

    /// Copy the decoded data held by the mnemonic into a new vector.
    pub fn to_bytes(&self) -> Vec<u8> {
        self.data.clone()
    }

    /// Convert the Mnemonic into the internal representation of the decoded data, consuming the
    /// Mnemonic without copying the data.
    pub fn into_bytes(self) -> Vec<u8> {
        self.data
    }

    /// Return a copy of the data held by the mnemonic.
    #[deprecated = "Use as_bytes(), to_bytes(), or into_bytes() instead"]
    pub fn entropy(&self) -> Vec<u8> {
        self.data.to_vec()
    }

    /// Derive a BIP-0032 seed from the mnemonic and an optional passphrase, returned as a
    /// vector.
    ///
    /// # Errors
    /// The method should not return an error; the result is always `Ok`.
    #[deprecated = "Use generate_seed() instead"]
    pub fn seed<'a>(
        &self,
        passphrase: impl Into<Option<&'a str>>,
    ) -> Result<Vec<u8>, MnemonicGenerationError> {
        let seed: [u8; 64] = self.generate_seed(passphrase);
        Ok(Vec::from(seed))
    }

    /// Derive a 64-byte BIP-0032 seed from the mnemonic and an optional passphrase.
    ///
    /// The mnemonic phrase is used as the PBKDF2 password, and the string `mnemonic` followed by
    /// the passphrase (or nothing, when no passphrase is given) as the salt. 2048 rounds of
    /// HMAC-SHA512 are applied.
    ///
    /// # Panics
    /// The function may panic if the HmacSha512 function returns an error. The only error the
    /// HmacSha512 function should return is an invalid length, which should not be possible.
    pub fn generate_seed<'a>(&self, passphrase: impl Into<Option<&'a str>>) -> [u8; 64] {
        let passphrase: Option<&str> = passphrase.into();
        let phrase = self.to_string();

        let mut salt = String::from("mnemonic");
        salt.push_str(passphrase.unwrap_or(""));

        let mut seed = [0u8; 64];
        pbkdf2::<Hmac<Sha512>>(phrase.as_bytes(), salt.as_bytes(), 2048, &mut seed)
            .expect(bug!("HmacSha512 InvalidLength should be infallible"));
        seed
    }

    /// Encode the mnemonic into a list of integers 11 bits in length, matching the length of a
    /// BIP-0039 wordlist.
    ///
    /// The data bits are followed by one checksum bit per 32 bits of data, taken from the start
    /// of the SHA-256 hash of the data. The combined bit string is then cut into groups of 11
    /// bits, most significant bit first; bits that do not fill a whole group are dropped.
    pub fn words(&self) -> Vec<usize> {
        let data_bit_count = self.data.len() * 8;
        let checksum_bit_count = data_bit_count / 32;

        let mut hasher = Sha256::new();
        hasher.update(&self.data);
        let digest = hasher.finalize().to_vec();

        let mut bits: Vec<bool> = Vec::with_capacity(data_bit_count + checksum_bit_count);
        // Data bits, most significant bit of each byte first.
        bits.extend(
            self.data
                .iter()
                .flat_map(|&byte| (0..8u32).map(move |offset| (byte & (0x80u8 >> offset)) != 0)),
        );
        // Checksum bits, read from the front of the digest.
        bits.extend(
            (0..checksum_bit_count).map(|i| (digest[i / 8] & (0x80u8 >> (i % 8))) != 0),
        );

        // TODO: find a way to not have to collect to vec
        bits.chunks_exact(11)
            .map(|group| {
                group
                    .iter()
                    .fold(0usize, |value, &bit| (value << 1) | usize::from(bit))
            })
            .collect()
    }
}

impl<W: Wordlist> MnemonicBase<W> {
    /// Deprecated alias of [`MnemonicBase::try_from_slice`]. The data is expected to be of 128,
    /// 192, or 256 bits, as per BIP-0039.
    ///
    /// # Errors
    /// An error may be returned if the data is not within the expected lengths.
    #[deprecated = "use Mnemonic::try_from_slice"]
    pub fn from_bytes(bytes: &[u8]) -> Result<MnemonicBase<W>, MnemonicGenerationError> {
        Self::try_from_slice(bytes)
    }

    /// Deprecated alias of [`MnemonicBase::try_from_slice`]. The data is expected to be of 128,
    /// 192, or 256 bits, as per BIP-0039.
    ///
    /// # Errors
    /// An error may be returned if the data is not within the expected lengths.
    #[deprecated = "use Mnemonic::try_from_slice"]
    pub fn from_entropy(bytes: &[u8]) -> Result<MnemonicBase<W>, MnemonicGenerationError> {
        Self::try_from_slice(bytes)
    }

    /// Deprecated alias of [`MnemonicBase::from_array`]. The length of the data must be a
    /// multiple of 4 bytes, up to 1024 bytes.
    #[deprecated = "Use Mnemonic::from_array"]
    pub fn from_nonstandard_bytes<const N: usize>(bytes: [u8; N]) -> MnemonicBase<W> {
        Self::from_array(bytes)
    }
}

#[cfg(test)]
mod tests {
    use std::{collections::HashSet, fs::File, io::Read};

    use super::*;

    // Building the singleton panics if the embedded wordlist yields fewer than 2048 words, so
    // simply obtaining it is the check.
    #[test]
    fn can_load_wordlist() {
        let _wordlist = English::get_singleton();
    }

    // A mnemonic built from random data must hand the same data back.
    #[test]
    fn reproduces_its_own_seed() {
        let mut entropy = [0u8; 256 / 8];
        File::open("/dev/random")
            .unwrap()
            .read_exact(&mut entropy)
            .unwrap();

        let mnemonic = super::Mnemonic::try_from_slice(&entropy).unwrap();
        assert_eq!(mnemonic.as_bytes(), &entropy[..]);
    }

    // Checks the encoder against the bundled `data/vectors.json` test vectors.
    #[test]
    fn conforms_to_trezor_tests() {
        let content = include_str!("data/vectors.json");
        let jsonobj: serde_json::Value = serde_json::from_str(content).unwrap();

        for test in jsonobj["english"].as_array().unwrap() {
            // Each vector is an array; only its first two entries are used here. The first is
            // the hex-encoded data and the second (bound as `seed`) is the phrase that the
            // mnemonic is expected to render as.
            let [ref hex_, ref seed, ..] = test.as_array().unwrap()[..] else {
                panic!("bad test: {test}");
            };
            let hex = hex::decode(hex_.as_str().unwrap()).unwrap();

            let mnemonic = Mnemonic::try_from_slice(&hex).unwrap();

            assert_eq!(mnemonic.to_string(), seed.as_str().unwrap());
        }
    }

    // Cross-checks the phrase and the derived seeds against the `bip39` crate for random data.
    #[test]
    fn matches_bip39_crate() {
        let mut entropy = [0u8; 256 / 8];
        File::open("/dev/random")
            .unwrap()
            .read_exact(&mut entropy)
            .unwrap();

        let ours = Mnemonic::try_from_slice(&entropy).unwrap();
        let theirs = bip39::Mnemonic::from_entropy(&entropy).unwrap();

        assert_eq!(ours.to_string(), theirs.to_string());
        // No passphrase on our side corresponds to the empty passphrase on theirs.
        assert_eq!(ours.generate_seed(None), theirs.to_seed(""));
        assert_eq!(ours.generate_seed("testing"), theirs.to_seed("testing"));
        // Different passphrases must not produce the same seed.
        assert_ne!(ours.generate_seed("test1"), theirs.to_seed("test2"));
    }

    // Statistical sanity check: the share of random 24-word mnemonics containing at least one
    // repeated word should be close to what the birthday problem predicts.
    #[test]
    fn count_rate_of_duplicate_words() {
        let tests = 100_000;
        let mut count = 0.;
        let entropy = &mut [0u8; 256 / 8];
        let mut random = std::fs::File::open("/dev/urandom").unwrap();
        let mut hs = HashSet::<usize>::with_capacity(24);

        for _ in 0..tests {
            random.read_exact(&mut entropy[..]).unwrap();
            let mnemonic = Mnemonic::try_from_slice(&entropy[..256 / 8]).unwrap();
            let words = mnemonic.words();
            // Fewer than 24 distinct values means at least one word was repeated.
            hs.clear();
            hs.extend(words);
            if hs.len() != 24 {
                count += 1.;
            }
        }

        // NOTE: Birthday problem math is: 0.126532
        // Set values to (about) 1 below, 1 above
        // Source: https://en.wikipedia.org/wiki/Birthday_problem
        let min = 11.5;
        let max = 13.5;
        assert!(
            count > f64::from(tests) * min / 100.,
            "{count} probability should be more than {min}%: {}",
            count / f64::from(tests)
        );
        assert!(
            count < f64::from(tests) * max / 100.,
            "{count} probability should be less than {max}%: {}",
            count / f64::from(tests)
        );
    }

    // 1024 bytes is the largest size accepted by `from_array`: 8192 data bits plus 256 checksum
    // bits make 8448 bits, i.e. 768 words of 11 bits each.
    #[test]
    fn can_do_up_to_8192_bits() {
        let mut entropy = [0u8; 1024];
        let mut random = std::fs::File::open("/dev/urandom").unwrap();
        random.read_exact(&mut entropy[..]).unwrap();
        let mnemonic = Mnemonic::from_array(entropy);
        let words = mnemonic.words();
        assert_eq!(words.len(), 768);
    }

    // Four bytes past the 1024-byte limit must be rejected at runtime.
    #[test]
    #[should_panic]
    fn fails_over_8192_bits() {
        let mut entropy = [0u8; 1024 + 4];
        std::fs::File::open("/dev/urandom")
            .unwrap()
            .read_exact(&mut entropy)
            .unwrap();
        let _mnemonic = Mnemonic::from_raw_bytes(&entropy);
    }

    // A length that is not a multiple of 4 bytes must be rejected at runtime.
    #[test]
    #[should_panic]
    fn fails_over_invalid_size() {
        let mut entropy = [0u8; 255];
        std::fs::File::open("/dev/urandom")
            .unwrap()
            .read_exact(&mut entropy)
            .unwrap();
        let _mnemonic = Mnemonic::from_raw_bytes(&entropy);
    }
}