From 32daa498343e534be1ddee69e90e288108dc3ba6 Mon Sep 17 00:00:00 2001 From: Steven Roose Date: Sat, 18 Jul 2020 13:58:54 +0200 Subject: [PATCH] Add awareness of uniqueness of words in word lists This is needed to have good guarantees for the Mnemonic::guess_language method which can therefore be renamed to Mnemonic::language_of. --- src/language/mod.rs | 81 +++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 69 +++++++++++++++++++++++--------------- 2 files changed, 123 insertions(+), 27 deletions(-) diff --git a/src/language/mod.rs b/src/language/mod.rs index 1ff9cd5..e373716 100644 --- a/src/language/mod.rs +++ b/src/language/mod.rs @@ -54,6 +54,30 @@ pub enum Language { } impl Language { + /// The list of supported languages. + /// Language support is managed by compile features. + pub fn all() -> &'static [Language] { + &[ + Language::English, + #[cfg(feature = "chinese-simplified")] + Language::SimplifiedChinese, + #[cfg(feature = "chinese-traditional")] + Language::TraditionalChinese, + #[cfg(feature = "czech")] + Language::Czech, + #[cfg(feature = "french")] + Language::French, + #[cfg(feature = "italian")] + Language::Italian, + #[cfg(feature = "japanese")] + Language::Japanese, + #[cfg(feature = "korean")] + Language::Korean, + #[cfg(feature = "spanish")] + Language::Spanish, + ] + } + /// The word list for this language. #[inline] pub(crate) fn word_list(self) -> &'static [&'static str; 2048] { @@ -78,6 +102,31 @@ impl Language { } } + /// Returns true if all words in the list are guaranteed to + /// only be in this list and not in any other. 
+ #[inline] + pub(crate) fn unique_words(self) -> bool { + match self { + Language::English => false, + #[cfg(feature = "chinese-simplified")] + Language::SimplifiedChinese => false, + #[cfg(feature = "chinese-traditional")] + Language::TraditionalChinese => false, + #[cfg(feature = "czech")] + Language::Czech => true, + #[cfg(feature = "french")] + Language::French => false, + #[cfg(feature = "italian")] + Language::Italian => true, + #[cfg(feature = "japanese")] + Language::Japanese => true, + #[cfg(feature = "korean")] + Language::Korean => true, + #[cfg(feature = "spanish")] + Language::Spanish => true, + } + } + /// Get words from the word list that start with the given prefix. pub fn words_by_prefix(self, prefix: &str) -> &[&'static str] { // The words in the word list are ordered lexicographically. This means @@ -170,4 +219,36 @@ mod tests { let res = lang.words_by_prefix("woof"); assert!(res.is_empty()); } + + #[cfg(all( + feature = "chinese-simplified", feature = "chinese-traditional", feature = "czech", + feature = "french", feature = "italian", feature = "japanese", feature = "korean", + feature = "spanish" + ))] + #[test] + fn words_overlaps() { + use std::collections::HashMap; + + // We keep a map of all words and the languages they occur in. + // Afterwards, we make sure that no word maps to multiple languages + // if either of those is guaranteed to have unique words. 
+ let mut words: HashMap<&str, Vec> = HashMap::new(); + for lang in Language::all().iter() { + for word in lang.word_list().iter() { + words.entry(word).or_insert(Vec::new()).push(*lang); + } + } + + let mut ok = true; + for (word, langs) in words.into_iter() { + if langs.len() == 1 { + continue; + } + if langs.iter().any(|l| l.unique_words()) { + println!("Word {} is not unique: {:?}", word, langs); + ok = false; + } + } + assert!(ok); + } } diff --git a/src/lib.rs b/src/lib.rs index 997a27d..40ad59d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -58,6 +58,8 @@ pub enum Error { BadEntropyBitCount(usize), /// The mnemonic has an invalid checksum. InvalidChecksum, + /// The word list can be interpreted as multiple languages. + AmbiguousWordList(Vec), } impl fmt::Display for Error { @@ -74,6 +76,7 @@ impl fmt::Display for Error { "entropy was not between 128-256 bits or not a multiple of 32 bits: {} bits", c, ), Error::InvalidChecksum => write!(f, "the mnemonic has an invalid checksum"), + Error::AmbiguousWordList(ref langs) => write!(f, "ambiguous word list: {:?}", langs), } } } @@ -216,47 +219,50 @@ impl Mnemonic { Ok(()) } - /// Guess the language of the mnemonic based on the first word. + /// Determine the language of the mnemonic based on the first word. /// - /// This works as official word lists are made as such that a word never - /// appears in two different word lists. 
- pub fn guess_language(s: &str) -> Result { - let languages = [ - Language::English, - #[cfg(feature = "chinese-simplified")] - Language::SimplifiedChinese, - #[cfg(feature = "chinese-traditional")] - Language::TraditionalChinese, - #[cfg(feature = "czech")] - Language::Czech, - #[cfg(feature = "french")] - Language::French, - #[cfg(feature = "italian")] - Language::Italian, - #[cfg(feature = "japanese")] - Language::Japanese, - #[cfg(feature = "korean")] - Language::Korean, - #[cfg(feature = "spanish")] - Language::Spanish, - ]; + /// Some word lists don't guarantee that their words don't occur in other + /// word lists. In the extremely unlikely case that a word list can be + /// interpreted in multiple languages, an [Error::AmbiguousWordList] is + /// returned, containing the possible languages. + pub fn language_of(s: &str) -> Result { + // First we try wordlists that have guaranteed unique words. let first_word = s.split_whitespace().next().unwrap(); if first_word.len() == 0 { return Err(Error::BadWordCount(0)); } - for language in &languages { + for language in Language::all().iter().filter(|l| l.unique_words()) { if language.find_word(first_word).is_some() { return Ok(*language); } } - Err(Error::UnknownWord(first_word.to_owned())) + + // If that didn't work, we start with all possible languages + // (those without unique words), and eliminate until there is + // just one left. + let mut langs: Vec<_> = + Language::all().iter().filter(|l| !l.unique_words()).cloned().collect(); + for word in s.split_whitespace() { + langs.retain(|l| l.find_word(word).is_some()); + + // If there is just one language left, return it. + if langs.len() == 1 { + return Ok(langs[0]); + } + + // If all languages were eliminated, it's an invalid word. + if langs.is_empty() { + return Err(Error::UnknownWord(word.to_owned())) + } + } + Err(Error::AmbiguousWordList(langs)) } /// Parse a mnemonic and detect the language from the enabled languages. 
pub fn parse<'a, S: Into>>(s: S) -> Result { let mut cow = s.into(); Mnemonic::normalize_utf8_cow(&mut cow); - let language = Mnemonic::guess_language(cow.as_ref())?; + let language = Mnemonic::language_of(cow.as_ref())?; Mnemonic::validate_in(language, cow.as_ref())?; Ok(Mnemonic(cow.into_owned())) } @@ -309,7 +315,7 @@ impl Mnemonic { // We unwrap errors here because this method can only be called on // values that were already previously validated. - let language = Mnemonic::guess_language(self.as_str()).unwrap(); + let language = Mnemonic::language_of(self.as_str()).unwrap(); // Preallocate enough space for the longest possible word list let mut entropy = Vec::with_capacity(33); @@ -361,6 +367,15 @@ mod tests { use bitcoin_hashes::hex::FromHex; + #[cfg(feature = "rand")] + #[test] + fn test_language_of() { + for lang in Language::all() { + let m = Mnemonic::generate_in(*lang, 24).unwrap(); + assert_eq!(*lang, Mnemonic::language_of(m.as_str()).unwrap()); + } + } + #[test] fn test_vectors_english() { // These vectors are tuples of