Add awareness of uniqueness of words in word lists

This is needed to have good guarantees for the
Mnemonic::guess_language method which can therefore
be renamed to Mnemonic::language_of.
This commit is contained in:
Steven Roose 2020-07-18 13:58:54 +02:00
parent 6179d293ee
commit 32daa49834
No known key found for this signature in database
GPG Key ID: 2F2A88D7F8D68E87
2 changed files with 123 additions and 27 deletions

View File

@ -54,6 +54,30 @@ pub enum Language {
}
impl Language {
/// Every language whose word list is compiled into this build.
///
/// English is always included; each other language only appears when its
/// corresponding compile feature is enabled.
pub fn all() -> &'static [Language] {
    // Kept as one named table so the feature-gated entries are easy to scan.
    const SUPPORTED: &'static [Language] = &[
        Language::English,
        #[cfg(feature = "chinese-simplified")]
        Language::SimplifiedChinese,
        #[cfg(feature = "chinese-traditional")]
        Language::TraditionalChinese,
        #[cfg(feature = "czech")]
        Language::Czech,
        #[cfg(feature = "french")]
        Language::French,
        #[cfg(feature = "italian")]
        Language::Italian,
        #[cfg(feature = "japanese")]
        Language::Japanese,
        #[cfg(feature = "korean")]
        Language::Korean,
        #[cfg(feature = "spanish")]
        Language::Spanish,
    ];
    SUPPORTED
}
/// The word list for this language.
#[inline]
pub(crate) fn word_list(self) -> &'static [&'static str; 2048] {
@ -78,6 +102,31 @@ impl Language {
}
}
/// Whether every word of this language's list is guaranteed to appear in
/// this list only, and in no other language's list.
#[inline]
pub(crate) fn unique_words(self) -> bool {
    match self {
        // Lists whose words are guaranteed not to occur in any other list.
        #[cfg(feature = "czech")]
        Language::Czech => true,
        #[cfg(feature = "italian")]
        Language::Italian => true,
        #[cfg(feature = "japanese")]
        Language::Japanese => true,
        #[cfg(feature = "korean")]
        Language::Korean => true,
        #[cfg(feature = "spanish")]
        Language::Spanish => true,

        // Lists that carry no such guarantee.
        Language::English => false,
        #[cfg(feature = "chinese-simplified")]
        Language::SimplifiedChinese => false,
        #[cfg(feature = "chinese-traditional")]
        Language::TraditionalChinese => false,
        #[cfg(feature = "french")]
        Language::French => false,
    }
}
/// Get words from the word list that start with the given prefix.
pub fn words_by_prefix(self, prefix: &str) -> &[&'static str] {
// The words in the word list are ordered lexicographically. This means
@ -170,4 +219,36 @@ mod tests {
let res = lang.words_by_prefix("woof");
assert!(res.is_empty());
}
#[cfg(all(
    feature = "chinese-simplified",
    feature = "chinese-traditional",
    feature = "czech",
    feature = "french",
    feature = "italian",
    feature = "japanese",
    feature = "korean",
    feature = "spanish"
))]
#[test]
fn words_overlaps() {
    use std::collections::HashMap;

    // Record, for every word, each language whose list contains it.
    let mut seen_in: HashMap<&str, Vec<Language>> = HashMap::new();
    for lang in Language::all().iter() {
        for word in lang.word_list().iter() {
            seen_in.entry(*word).or_insert_with(Vec::new).push(*lang);
        }
    }

    // A word shared between several lists is only acceptable when none of
    // the lists involved claims to have unique words.
    let mut ok = true;
    let shared = seen_in.into_iter().filter(|entry| entry.1.len() > 1);
    for (word, langs) in shared {
        if langs.iter().any(|l| l.unique_words()) {
            println!("Word {} is not unique: {:?}", word, langs);
            ok = false;
        }
    }
    assert!(ok);
}
}

View File

@ -58,6 +58,8 @@ pub enum Error {
BadEntropyBitCount(usize),
/// The mnemonic has an invalid checksum.
InvalidChecksum,
/// The word list can be interpreted as multiple languages.
AmbiguousWordList(Vec<Language>),
}
impl fmt::Display for Error {
@ -74,6 +76,7 @@ impl fmt::Display for Error {
"entropy was not between 128-256 bits or not a multiple of 32 bits: {} bits", c,
),
Error::InvalidChecksum => write!(f, "the mnemonic has an invalid checksum"),
Error::AmbiguousWordList(ref langs) => write!(f, "ambiguous word list: {:?}", langs),
}
}
}
@ -216,47 +219,50 @@ impl Mnemonic {
Ok(())
}
/// Guess the language of the mnemonic based on the first word.
/// Determine the language of the mnemonic based on the first word.
///
/// This works as official word lists are made as such that a word never
/// appears in two different word lists.
pub fn guess_language(s: &str) -> Result<Language, Error> {
let languages = [
Language::English,
#[cfg(feature = "chinese-simplified")]
Language::SimplifiedChinese,
#[cfg(feature = "chinese-traditional")]
Language::TraditionalChinese,
#[cfg(feature = "czech")]
Language::Czech,
#[cfg(feature = "french")]
Language::French,
#[cfg(feature = "italian")]
Language::Italian,
#[cfg(feature = "japanese")]
Language::Japanese,
#[cfg(feature = "korean")]
Language::Korean,
#[cfg(feature = "spanish")]
Language::Spanish,
];
/// Some word lists don't guarantee that their words don't occur in other
/// word lists. In the extremely unlikely case that a word list can be
/// interpreted in multiple languages, an [Error::AmbiguousWordList] is
/// returned, containing the possible languages.
pub fn language_of(s: &str) -> Result<Language, Error> {
// First we try wordlists that have guaranteed unique words.
let first_word = s.split_whitespace().next().unwrap();
if first_word.len() == 0 {
return Err(Error::BadWordCount(0));
}
for language in &languages {
for language in Language::all().iter().filter(|l| l.unique_words()) {
if language.find_word(first_word).is_some() {
return Ok(*language);
}
}
Err(Error::UnknownWord(first_word.to_owned()))
// If that didn't work, we start with all possible languages
// (those without unique words), and eliminate until there is
// just one left.
let mut langs: Vec<_> =
Language::all().iter().filter(|l| !l.unique_words()).cloned().collect();
for word in s.split_whitespace() {
langs.retain(|l| l.find_word(word).is_some());
// If there is just one language left, return it.
if langs.len() == 1 {
return Ok(langs[0]);
}
// If all languages were eliminated, it's an invalid word.
if langs.is_empty() {
return Err(Error::UnknownWord(word.to_owned()))
}
}
Err(Error::AmbiguousWordList(langs))
}
/// Parse a mnemonic and detect the language from the enabled languages.
///
/// The input is converted into a `Cow<str>` and passed through
/// `Mnemonic::normalize_utf8_cow`. The language is then detected from the
/// normalized text, and the text is validated for that language with
/// `Mnemonic::validate_in` before being stored as an owned string.
///
/// # Errors
///
/// Propagates any error returned by language detection or by
/// `Mnemonic::validate_in`.
pub fn parse<'a, S: Into<Cow<'a, str>>>(s: S) -> Result<Mnemonic, Error> {
let mut cow = s.into();
Mnemonic::normalize_utf8_cow(&mut cow);
// The language has to be known first because validation is done per language.
let language = Mnemonic::guess_language(cow.as_ref())?;
let language = Mnemonic::language_of(cow.as_ref())?;
Mnemonic::validate_in(language, cow.as_ref())?;
Ok(Mnemonic(cow.into_owned()))
}
@ -309,7 +315,7 @@ impl Mnemonic {
// We unwrap errors here because this method can only be called on
// values that were already previously validated.
let language = Mnemonic::guess_language(self.as_str()).unwrap();
let language = Mnemonic::language_of(self.as_str()).unwrap();
// Preallocate enough space for the longest possible word list
let mut entropy = Vec::with_capacity(33);
@ -361,6 +367,15 @@ mod tests {
use bitcoin_hashes::hex::FromHex;
#[cfg(feature = "rand")]
#[test]
fn test_language_of() {
    // A freshly generated mnemonic must be detected as the language it was
    // generated in, for every enabled language.
    for &lang in Language::all() {
        let mnemonic = Mnemonic::generate_in(lang, 24).unwrap();
        let detected = Mnemonic::language_of(mnemonic.as_str()).unwrap();
        assert_eq!(lang, detected);
    }
}
#[test]
fn test_vectors_english() {
// These vectors are tuples of