From 471ef18d52bcd77f8c4f13fb22a49a1cc7b4664a Mon Sep 17 00:00:00 2001 From: Christian Reitter Date: Sun, 12 Jan 2025 16:06:35 +0100 Subject: [PATCH] Refactor and add second variant, originally written by Heiko Schaefer --- Cargo.lock | 249 +++++++++++++++++- Cargo.toml | 16 +- .../bloom_filter_generator.rs} | 3 + src/bin/bloom_filter_generator_hashed.rs | 95 +++++++ 4 files changed, 361 insertions(+), 2 deletions(-) rename src/{main.rs => bin/bloom_filter_generator.rs} (91%) create mode 100644 src/bin/bloom_filter_generator_hashed.rs diff --git a/Cargo.lock b/Cargo.lock index d5cc21d..4e73a45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,21 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" [[package]] name = "anstream" @@ -51,18 +66,57 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "async-trait" +version = "0.1.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "backtrace" +version = "0.3.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", +] + [[package]] name = "bit-vec" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d2c54ff287cfc0a34f38a6b832ea1bd8e448a330b3e40a50859e6488bee07f22" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bloom_filter_gen" version = "0.1.0" dependencies = [ "bloomfilter", "clap", + "csv", + "serde", + "serde_derive", + "sha256", ] [[package]] @@ -82,6 +136,12 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "bytes" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" + [[package]] name = "cfg-if" version = "1.0.0" @@ -134,6 +194,66 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +[[package]] +name = "cpufeatures" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -147,18 +267,36 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itoa" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + [[package]] name = "js-sys" version = "0.3.76" @@ -181,12 +319,42 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "memchr" +version = "2.7.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" +dependencies = [ + "adler2", +] + +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + [[package]] name = "proc-macro2" version = "1.0.92" @@ -205,6 +373,62 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "serde" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +dependencies = [ + "proc-macro2", + "quote", + "syn", 
+] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha256" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18278f6a914fa3070aa316493f7d2ddfb9ac86ebc06fa3b83bffda487e9065b0" +dependencies = [ + "async-trait", + "bytes", + "hex", + "sha2", + "tokio", +] + [[package]] name = "siphasher" version = "1.0.1" @@ -228,6 +452,23 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tokio" +version = "1.43.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" +dependencies = [ + "backtrace", + "bytes", + "pin-project-lite", +] + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "unicode-ident" version = "1.0.14" @@ -240,6 +481,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/Cargo.toml b/Cargo.toml index a7e8369..fc322d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,10 @@ publish = false [dependencies] bloomfilter = "1" clap = { version = "4", features = ["derive"] } +sha256 = "1.5" +csv = "1.3.0" +serde = "1.0" +serde_derive = "1.0" [profile.release] # note that most of these are the default setting anyway @@ -27,4 +31,14 @@ 
codegen-units = 1 # default false # lto = 'thin' -lto = 'fat' \ No newline at end of file +lto = 'fat' + +[[bin]] +name = "bloom_filter_generator" +test = false +bench = false + +[[bin]] +name = "bloom_filter_generator_hashed" +test = false +bench = false \ No newline at end of file diff --git a/src/main.rs b/src/bin/bloom_filter_generator.rs similarity index 91% rename from src/main.rs rename to src/bin/bloom_filter_generator.rs index 0c39489..6e319d9 100644 --- a/src/main.rs +++ b/src/bin/bloom_filter_generator.rs @@ -8,12 +8,15 @@ use std::path::PathBuf; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Opts { + /// Expected: line-separated ASCII file with inputs #[arg(short, long, help = "Input file", default_value = "addresses.txt")] input_file: PathBuf, #[arg(short, long, help = "Output file", default_value = "bloom.dump")] output_file: PathBuf, + /// Default value comes from number of publicly known bitcoin addresses at the time + /// Adjust for your use case #[arg( long, help = "Bloom filter: number of items", diff --git a/src/bin/bloom_filter_generator_hashed.rs b/src/bin/bloom_filter_generator_hashed.rs new file mode 100644 index 0000000..064699f --- /dev/null +++ b/src/bin/bloom_filter_generator_hashed.rs @@ -0,0 +1,95 @@ +use bloomfilter::Bloom; +use clap::{arg, Parser}; +use serde_derive::Deserialize; +use std::error::Error; +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::PathBuf; + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Opts { + /// See CSV record type for format details + #[arg(short, long, help = "Input file (mnemonics csv)")] + input_file: PathBuf, + + #[arg(short, long, help = "Output file (bloom filter dump of sha256 hashes)")] + output_file: PathBuf, + + /// expected during initial use case: 3x 2^32 entries + /// consisting of the 128, 192 and 256 bit bx PRNG ranges with 32 bit complexity each + #[arg( + long, + help = "Bloom filter: 
number of items", + default_value_t = 12_884_901_888 + )] + num_items: usize, + + /// highly depends on use case + #[arg( + long, + help = "Bloom filter: acceptable rate of false positives", + default_value_t = 0.000_001 + )] + fp_rate: f64, +} + +#[derive(Debug, Deserialize)] +struct MnemonicsRecord { + _index: u32, + mnemonic: String, +} + +fn main() -> Result<(), Box<dyn Error>> { + let opts: Opts = Opts::parse(); + + let mut bloom = Bloom::new_for_fp_rate(opts.num_items, opts.fp_rate); + + // Improvement idea: use higher than default capacity? + let rdr = csv::ReaderBuilder::new() + .has_headers(false) + .from_path(&opts.input_file) + .expect("Failed to open input_file"); + + let wtr = File::create(&opts.output_file).expect("error opening output file"); + let mut wtr = BufWriter::new(wtr); + + let mut count: usize = 0; + + rdr.into_deserialize() + .for_each(|result: Result<MnemonicsRecord, csv::Error>| { + if count & 0b1_1111_1111_1111_1111_1111 == 0 { + println!("Read {} lines", count); + } + count += 1; + + match result { + Ok(record) => { + let val: String = sha256::digest(record.mnemonic); + + bloom.set(&val) + } + Err(err) => eprintln!("Error reading CSV record: {}", err), + } + }); + + // Dump bloom filter to file + println!("Serializing bloom filter to output file"); + + // - metadata + wtr.write_all(&bloom.number_of_bits().to_be_bytes())?; + wtr.write_all(&bloom.number_of_hash_functions().to_be_bytes())?; + wtr.write_all(&bloom.sip_keys()[0].0.to_be_bytes())?; + wtr.write_all(&bloom.sip_keys()[0].1.to_be_bytes())?; + wtr.write_all(&bloom.sip_keys()[1].0.to_be_bytes())?; + wtr.write_all(&bloom.sip_keys()[1].1.to_be_bytes())?; + + wtr.flush()?; + + // - bitmap + wtr.write_all(&bloom.bitmap())?; + + wtr.flush()?; + + Ok(()) +}