Add awareness of uniqueness of words in word lists

This is needed to have good guarantees for the Mnemonic::guess_language method, which can therefore be renamed to Mnemonic::language_of.
2020-07-18 13:58:54 +02:00 · 2020-07-18 13:58:54 +02:00 · 32daa49834
parent 6179d293ee
commit 32daa49834
2 changed files with 123 additions and 27 deletions
--- a/src/language/mod.rs
+++ b/src/language/mod.rs
@ -54,6 +54,30 @@ pub enum Language {
 }

 impl Language {
+	/// The list of supported languages.
+	/// Language support is managed by compile features.
+	pub fn all() -> &'static [Language] {
+		&[
+			Language::English,
+			#[cfg(feature = "chinese-simplified")]
+			Language::SimplifiedChinese,
+			#[cfg(feature = "chinese-traditional")]
+			Language::TraditionalChinese,
+			#[cfg(feature = "czech")]
+			Language::Czech,
+			#[cfg(feature = "french")]
+			Language::French,
+			#[cfg(feature = "italian")]
+			Language::Italian,
+			#[cfg(feature = "japanese")]
+			Language::Japanese,
+			#[cfg(feature = "korean")]
+			Language::Korean,
+			#[cfg(feature = "spanish")]
+			Language::Spanish,
+		]
+	}
+
 	/// The word list for this language.
 	#[inline]
 	pub(crate) fn word_list(self) -> &'static [&'static str; 2048] {
@ -78,6 +102,31 @@ impl Language {
 		}
 	}

+	/// Returns true if all words in the list are guaranteed to
+	/// only be in this list and not in any other.
+	#[inline]
+	pub(crate) fn unique_words(self) -> bool {
+		match self {
+			Language::English => false,
+			#[cfg(feature = "chinese-simplified")]
+			Language::SimplifiedChinese => false,
+			#[cfg(feature = "chinese-traditional")]
+			Language::TraditionalChinese => false,
+			#[cfg(feature = "czech")]
+			Language::Czech => true,
+			#[cfg(feature = "french")]
+			Language::French => false,
+			#[cfg(feature = "italian")]
+			Language::Italian => true,
+			#[cfg(feature = "japanese")]
+			Language::Japanese => true,
+			#[cfg(feature = "korean")]
+			Language::Korean => true,
+			#[cfg(feature = "spanish")]
+			Language::Spanish => true,
+		}
+	}
+
 	/// Get words from the word list that start with the given prefix.
 	pub fn words_by_prefix(self, prefix: &str) -> &[&'static str] {
 		// The words in the word list are ordered lexicographically. This means
@ -170,4 +219,36 @@ mod tests {
 		let res = lang.words_by_prefix("woof");
 		assert!(res.is_empty());
 	}
+
+	#[cfg(all(
+		feature = "chinese-simplified", feature = "chinese-traditional", feature = "czech",
+		feature = "french", feature = "italian", feature = "japanese", feature = "korean",
+		feature = "spanish"
+	))]
+    #[test]
+    fn words_overlaps() {
+		use std::collections::HashMap;
+
+		// We keep a map of all words and the languages they occur in.
+		// Afterwards, we make sure that no word maps to multiple languages
+		// if either of those is guaranteed to have unique words.
+        let mut words: HashMap<&str, Vec<Language>> = HashMap::new();
+		for lang in Language::all().iter() {
+			for word in lang.word_list().iter() {
+				words.entry(word).or_insert(Vec::new()).push(*lang);
+			}
+		}
+
+		let mut ok = true;
+		for (word, langs) in words.into_iter() {
+			if langs.len() == 1 {
+				continue;
+			}
+			if langs.iter().any(|l| l.unique_words()) {
+				println!("Word {} is not unique: {:?}", word, langs);
+				ok = false;
+			}
+		}
+		assert!(ok);
+    }
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@ -58,6 +58,8 @@ pub enum Error {
 	BadEntropyBitCount(usize),
 	/// The mnemonic has an invalid checksum.
 	InvalidChecksum,
+	/// The word list can be interpreted as multiple languages.
+	AmbiguousWordList(Vec<Language>),
 }

 impl fmt::Display for Error {
@ -74,6 +76,7 @@ impl fmt::Display for Error {
 				"entropy was not between 128-256 bits or not a multiple of 32 bits: {} bits", c,
 			),
 			Error::InvalidChecksum => write!(f, "the mnemonic has an invalid checksum"),
+			Error::AmbiguousWordList(ref langs) => write!(f, "ambiguous word list: {:?}", langs),
 		}
 	}
 }
@ -216,47 +219,50 @@ impl Mnemonic {
 		Ok(())
 	}

-	/// Guess the language of the mnemonic based on the first word.
+	/// Determine the language of the mnemonic based on the first word.
 	///
-	/// This works as official word lists are made as such that a word never
-	/// appears in two different word lists.
-	pub fn guess_language(s: &str) -> Result<Language, Error> {
-		let languages = [
-			Language::English,
-			#[cfg(feature = "chinese-simplified")]
-			Language::SimplifiedChinese,
-			#[cfg(feature = "chinese-traditional")]
-			Language::TraditionalChinese,
-			#[cfg(feature = "czech")]
-			Language::Czech,
-			#[cfg(feature = "french")]
-			Language::French,
-			#[cfg(feature = "italian")]
-			Language::Italian,
-			#[cfg(feature = "japanese")]
-			Language::Japanese,
-			#[cfg(feature = "korean")]
-			Language::Korean,
-			#[cfg(feature = "spanish")]
-			Language::Spanish,
-		];
+	/// Some word lists don't guarantee that their words don't occur in other
+	/// word lists. In the extremely unlikely case that a word list can be
+	/// interpreted in multiple languages, an [Error::AmbiguousWordList] is
+	/// returned, containing the possible languages.
+	pub fn language_of(s: &str) -> Result<Language, Error> {
+		// First we try wordlists that have guaranteed unique words.
 		let first_word = s.split_whitespace().next().unwrap();
 		if first_word.len() == 0 {
 			return Err(Error::BadWordCount(0));
 		}
-		for language in &languages {
+		for language in Language::all().iter().filter(|l| l.unique_words()) {
 			if language.find_word(first_word).is_some() {
 				return Ok(*language);
 			}
 		}
-		Err(Error::UnknownWord(first_word.to_owned()))
+
+		// If that didn't work, we start with all possible languages
+		// (those without unique words), and eliminate until there is
+		// just one left.
+		let mut langs: Vec<_> =
+			Language::all().iter().filter(|l| !l.unique_words()).cloned().collect();
+		for word in s.split_whitespace() {
+			langs.retain(|l| l.find_word(word).is_some());
+
+			// If there is just one language left, return it.
+			if langs.len() == 1 {
+				return Ok(langs[0]);
+			}
+
+			// If all languages were eliminated, it's an invalid word.
+			if langs.is_empty() {
+				return Err(Error::UnknownWord(word.to_owned()))
+			}
+		}
+		Err(Error::AmbiguousWordList(langs))
 	}

 	/// Parse a mnemonic and detect the language from the enabled languages.
 	pub fn parse<'a, S: Into<Cow<'a, str>>>(s: S) -> Result<Mnemonic, Error> {
 		let mut cow = s.into();
 		Mnemonic::normalize_utf8_cow(&mut cow);
-		let language = Mnemonic::guess_language(cow.as_ref())?;
+		let language = Mnemonic::language_of(cow.as_ref())?;
 		Mnemonic::validate_in(language, cow.as_ref())?;
 		Ok(Mnemonic(cow.into_owned()))
 	}
@ -309,7 +315,7 @@ impl Mnemonic {
 		// We unwrap errors here because this method can only be called on
 		// values that were already previously validated.

-		let language = Mnemonic::guess_language(self.as_str()).unwrap();
+		let language = Mnemonic::language_of(self.as_str()).unwrap();

 		// Preallocate enough space for the longest possible word list
 		let mut entropy = Vec::with_capacity(33);
@ -361,6 +367,15 @@ mod tests {

 	use bitcoin_hashes::hex::FromHex;

+	#[cfg(feature = "rand")]
+	#[test]
+	fn test_language_of() {
+		for lang in Language::all() {
+			let m = Mnemonic::generate_in(*lang, 24).unwrap();
+			assert_eq!(*lang, Mnemonic::language_of(m.as_str()).unwrap());
+		}
+	}
+
 	#[test]
 	fn test_vectors_english() {
 		// These vectors are tuples of