bloom-filter-generator: initial commit using code from milk-sad
This commit is contained in:
parent
12fb704461
commit
496df4b353
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,14 @@
|
||||||
|
[package]
|
||||||
|
name = "bloom-filter-generator"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
hex = "0.4.3"
|
||||||
|
serde = "1.0"
|
||||||
|
serde_derive = "1.0"
|
||||||
|
rayon = "1.7.0"
|
||||||
|
clap = {version = "4.0.32", features = ["derive"]}
|
||||||
|
bloomfilter = "1"
|
|
@ -0,0 +1,75 @@
|
||||||
|
use std::error::Error;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{BufWriter, BufReader, BufRead, Write};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use bloomfilter::Bloom;
|
||||||
|
use clap::{arg, Parser};
|
||||||
|
|
||||||
|
#[derive(Parser, Debug)]
|
||||||
|
#[command(author, version, about, long_about = None)]
|
||||||
|
struct Opts {
|
||||||
|
#[arg(short, long, help = "Input file (sha256 hashes csv)")]
|
||||||
|
input_file: PathBuf,
|
||||||
|
|
||||||
|
#[arg(short, long, help = "Output file (bloom filter dump of sha256 hashes)")]
|
||||||
|
output_file: PathBuf,
|
||||||
|
|
||||||
|
#[arg(
|
||||||
|
long,
|
||||||
|
help = "Bloom filter: number of items",
|
||||||
|
default_value_t = 13_194_396_000
|
||||||
|
)]
|
||||||
|
num_items: usize,
|
||||||
|
|
||||||
|
#[arg(
|
||||||
|
long,
|
||||||
|
help = "Bloom filter: wanted rate of false positives",
|
||||||
|
default_value_t = 0.000_001
|
||||||
|
)]
|
||||||
|
fp_rate: f64,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() -> Result<(), Box<dyn Error>> {
|
||||||
|
let opts: Opts = Opts::parse();
|
||||||
|
|
||||||
|
let mut bloom = Bloom::new_for_fp_rate(opts.num_items, opts.fp_rate);
|
||||||
|
|
||||||
|
let reader = BufReader::new(File::open(&opts.input_file)?).lines();
|
||||||
|
|
||||||
|
let writer = File::create(&opts.output_file)
|
||||||
|
.expect("error opening output file");
|
||||||
|
let mut writer = BufWriter::new(writer);
|
||||||
|
|
||||||
|
for (count, line) in reader.enumerate() {
|
||||||
|
if count & 0b1_1111_1111_1111_1111_1111 == 0 {
|
||||||
|
println!("Read {} lines", count);
|
||||||
|
}
|
||||||
|
|
||||||
|
match line {
|
||||||
|
Ok(hash) => {
|
||||||
|
bloom.set(&hash)
|
||||||
|
}
|
||||||
|
Err(err) => eprintln!("Error reading line record: {}", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dump bloom filter to file
|
||||||
|
println!("Serializing bloom filter to output file");
|
||||||
|
|
||||||
|
// - metadata
|
||||||
|
writer.write_all(&bloom.number_of_bits().to_be_bytes())?;
|
||||||
|
writer.write_all(&bloom.number_of_hash_functions().to_be_bytes())?;
|
||||||
|
writer.write_all(&bloom.sip_keys()[0].0.to_be_bytes())?;
|
||||||
|
writer.write_all(&bloom.sip_keys()[0].1.to_be_bytes())?;
|
||||||
|
writer.write_all(&bloom.sip_keys()[1].0.to_be_bytes())?;
|
||||||
|
writer.write_all(&bloom.sip_keys()[1].1.to_be_bytes())?;
|
||||||
|
|
||||||
|
writer.flush()?;
|
||||||
|
|
||||||
|
// - bitmap
|
||||||
|
writer.write_all(&bloom.bitmap())?;
|
||||||
|
|
||||||
|
writer.flush()?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
Loading…
Reference in New Issue