Add awareness of uniqueness of words in word lists
This is needed to have good guarantees for the Mnemonic::guess_language method which can therefore be renamed to Mnemonic::language_of.
This commit is contained in:
parent
6179d293ee
commit
32daa49834
|
@ -54,6 +54,30 @@ pub enum Language {
|
|||
}
|
||||
|
||||
impl Language {
|
||||
/// The list of supported languages.
|
||||
/// Language support is managed by compile features.
|
||||
pub fn all() -> &'static [Language] {
|
||||
&[
|
||||
Language::English,
|
||||
#[cfg(feature = "chinese-simplified")]
|
||||
Language::SimplifiedChinese,
|
||||
#[cfg(feature = "chinese-traditional")]
|
||||
Language::TraditionalChinese,
|
||||
#[cfg(feature = "czech")]
|
||||
Language::Czech,
|
||||
#[cfg(feature = "french")]
|
||||
Language::French,
|
||||
#[cfg(feature = "italian")]
|
||||
Language::Italian,
|
||||
#[cfg(feature = "japanese")]
|
||||
Language::Japanese,
|
||||
#[cfg(feature = "korean")]
|
||||
Language::Korean,
|
||||
#[cfg(feature = "spanish")]
|
||||
Language::Spanish,
|
||||
]
|
||||
}
|
||||
|
||||
/// The word list for this language.
|
||||
#[inline]
|
||||
pub(crate) fn word_list(self) -> &'static [&'static str; 2048] {
|
||||
|
@ -78,6 +102,31 @@ impl Language {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns true if all words in the list are guaranteed to
|
||||
/// only be in this list and not in any other.
|
||||
#[inline]
|
||||
pub(crate) fn unique_words(self) -> bool {
|
||||
match self {
|
||||
Language::English => false,
|
||||
#[cfg(feature = "chinese-simplified")]
|
||||
Language::SimplifiedChinese => false,
|
||||
#[cfg(feature = "chinese-traditional")]
|
||||
Language::TraditionalChinese => false,
|
||||
#[cfg(feature = "czech")]
|
||||
Language::Czech => true,
|
||||
#[cfg(feature = "french")]
|
||||
Language::French => false,
|
||||
#[cfg(feature = "italian")]
|
||||
Language::Italian => true,
|
||||
#[cfg(feature = "japanese")]
|
||||
Language::Japanese => true,
|
||||
#[cfg(feature = "korean")]
|
||||
Language::Korean => true,
|
||||
#[cfg(feature = "spanish")]
|
||||
Language::Spanish => true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get words from the word list that start with the given prefix.
|
||||
pub fn words_by_prefix(self, prefix: &str) -> &[&'static str] {
|
||||
// The words in the word list are ordered lexicographically. This means
|
||||
|
@ -170,4 +219,36 @@ mod tests {
|
|||
let res = lang.words_by_prefix("woof");
|
||||
assert!(res.is_empty());
|
||||
}
|
||||
|
||||
#[cfg(all(
    feature = "chinese-simplified", feature = "chinese-traditional", feature = "czech",
    feature = "french", feature = "italian", feature = "japanese", feature = "korean",
    feature = "spanish"
))]
#[test]
fn words_overlaps() {
    use std::collections::HashMap;

    // Index every word by the languages whose word list contains it.
    let mut occurrences: HashMap<&str, Vec<Language>> = HashMap::new();
    for &lang in Language::all() {
        for &word in lang.word_list().iter() {
            occurrences.entry(word).or_insert_with(Vec::new).push(lang);
        }
    }

    // A word shared between lists is only a violation when at least one
    // of the languages involved claims to have unique words. Report all
    // violations before failing so the full picture is printed at once.
    let mut violations = 0usize;
    for (word, langs) in occurrences.into_iter() {
        if langs.len() != 1 && langs.iter().any(|l| l.unique_words()) {
            println!("Word {} is not unique: {:?}", word, langs);
            violations += 1;
        }
    }

    let ok = violations == 0;
    assert!(ok);
}
|
||||
}
|
||||
|
|
69
src/lib.rs
69
src/lib.rs
|
@ -58,6 +58,8 @@ pub enum Error {
|
|||
BadEntropyBitCount(usize),
|
||||
/// The mnemonic has an invalid checksum.
|
||||
InvalidChecksum,
|
||||
/// The word list can be interpreted as multiple languages.
|
||||
AmbiguousWordList(Vec<Language>),
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
|
@ -74,6 +76,7 @@ impl fmt::Display for Error {
|
|||
"entropy was not between 128-256 bits or not a multiple of 32 bits: {} bits", c,
|
||||
),
|
||||
Error::InvalidChecksum => write!(f, "the mnemonic has an invalid checksum"),
|
||||
Error::AmbiguousWordList(ref langs) => write!(f, "ambiguous word list: {:?}", langs),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -216,47 +219,50 @@ impl Mnemonic {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Guess the language of the mnemonic based on the first word.
|
||||
/// Determine the language of the mnemonic based on the first word.
|
||||
///
|
||||
/// This works as official word lists are made as such that a word never
|
||||
/// appears in two different word lists.
|
||||
pub fn guess_language(s: &str) -> Result<Language, Error> {
|
||||
let languages = [
|
||||
Language::English,
|
||||
#[cfg(feature = "chinese-simplified")]
|
||||
Language::SimplifiedChinese,
|
||||
#[cfg(feature = "chinese-traditional")]
|
||||
Language::TraditionalChinese,
|
||||
#[cfg(feature = "czech")]
|
||||
Language::Czech,
|
||||
#[cfg(feature = "french")]
|
||||
Language::French,
|
||||
#[cfg(feature = "italian")]
|
||||
Language::Italian,
|
||||
#[cfg(feature = "japanese")]
|
||||
Language::Japanese,
|
||||
#[cfg(feature = "korean")]
|
||||
Language::Korean,
|
||||
#[cfg(feature = "spanish")]
|
||||
Language::Spanish,
|
||||
];
|
||||
/// Some word lists don't guarantee that their words don't occur in other
|
||||
/// word lists. In the extremely unlikely case that a word list can be
|
||||
/// interpreted in multiple languages, an [Error::AmbiguousWordList] is
|
||||
/// returned, containing the possible languages.
|
||||
pub fn language_of(s: &str) -> Result<Language, Error> {
|
||||
// First we try wordlists that have guaranteed unique words.
|
||||
let first_word = s.split_whitespace().next().unwrap();
|
||||
if first_word.len() == 0 {
|
||||
return Err(Error::BadWordCount(0));
|
||||
}
|
||||
for language in &languages {
|
||||
for language in Language::all().iter().filter(|l| l.unique_words()) {
|
||||
if language.find_word(first_word).is_some() {
|
||||
return Ok(*language);
|
||||
}
|
||||
}
|
||||
Err(Error::UnknownWord(first_word.to_owned()))
|
||||
|
||||
// If that didn't work, we start with all possible languages
|
||||
// (those without unique words), and eliminate until there is
|
||||
// just one left.
|
||||
let mut langs: Vec<_> =
|
||||
Language::all().iter().filter(|l| !l.unique_words()).cloned().collect();
|
||||
for word in s.split_whitespace() {
|
||||
langs.retain(|l| l.find_word(word).is_some());
|
||||
|
||||
// If there is just one language left, return it.
|
||||
if langs.len() == 1 {
|
||||
return Ok(langs[0]);
|
||||
}
|
||||
|
||||
// If all languages were eliminated, it's an invalid word.
|
||||
if langs.is_empty() {
|
||||
return Err(Error::UnknownWord(word.to_owned()))
|
||||
}
|
||||
}
|
||||
Err(Error::AmbiguousWordList(langs))
|
||||
}
|
||||
|
||||
/// Parse a mnemonic and detect the language from the enabled languages.
|
||||
pub fn parse<'a, S: Into<Cow<'a, str>>>(s: S) -> Result<Mnemonic, Error> {
|
||||
let mut cow = s.into();
|
||||
Mnemonic::normalize_utf8_cow(&mut cow);
|
||||
let language = Mnemonic::guess_language(cow.as_ref())?;
|
||||
let language = Mnemonic::language_of(cow.as_ref())?;
|
||||
Mnemonic::validate_in(language, cow.as_ref())?;
|
||||
Ok(Mnemonic(cow.into_owned()))
|
||||
}
|
||||
|
@ -309,7 +315,7 @@ impl Mnemonic {
|
|||
// We unwrap errors here because this method can only be called on
|
||||
// values that were already previously validated.
|
||||
|
||||
let language = Mnemonic::guess_language(self.as_str()).unwrap();
|
||||
let language = Mnemonic::language_of(self.as_str()).unwrap();
|
||||
|
||||
// Preallocate enough space for the longest possible word list
|
||||
let mut entropy = Vec::with_capacity(33);
|
||||
|
@ -361,6 +367,15 @@ mod tests {
|
|||
|
||||
use bitcoin_hashes::hex::FromHex;
|
||||
|
||||
#[cfg(feature = "rand")]
#[test]
fn test_language_of() {
    // A freshly generated 24-word mnemonic must be detected as the
    // language it was generated in, for every enabled language.
    for &lang in Language::all().iter() {
        let mnemonic = Mnemonic::generate_in(lang, 24).unwrap();
        let detected = Mnemonic::language_of(mnemonic.as_str()).unwrap();
        assert_eq!(lang, detected);
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_vectors_english() {
|
||||
// These vectors are tuples of
|
||||
|
|
Loading…
Reference in New Issue