Bloom filter code moved to separate repository
This commit is contained in:
parent
6028c4794f
commit
58367cbe7f
File diff suppressed because it is too large
Load Diff
|
@ -1,14 +0,0 @@
|
||||||
[package]
|
|
||||||
name = "bloom-filter-generator"
|
|
||||||
version = "0.1.0"
|
|
||||||
edition = "2021"
|
|
||||||
|
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
hex = "0.4.3"
|
|
||||||
serde = "1.0"
|
|
||||||
serde_derive = "1.0"
|
|
||||||
rayon = "1.7.0"
|
|
||||||
clap = {version = "4.0.32", features = ["derive"]}
|
|
||||||
bloomfilter = "1"
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
# bloom-filter-generator
|
||||||
|
|
||||||
|
This code has moved to https://git.distrust.co/milksad/rust-bloom-filter-generator.
|
||||||
|
|
||||||
|
## Usage note
|
||||||
|
|
||||||
|
Make sure the bloom filter size parameter (the expected number of items) matches the size of the input data set.
|
||||||
|
|
||||||
|
Experiment with the false positive rate parameter to trade memory footprint against accuracy.
|
|
@ -1,75 +0,0 @@
|
||||||
use std::error::Error;
|
|
||||||
use std::fs::File;
|
|
||||||
use std::io::{BufWriter, BufReader, BufRead, Write};
|
|
||||||
use std::path::PathBuf;
|
|
||||||
use bloomfilter::Bloom;
|
|
||||||
use clap::{arg, Parser};
|
|
||||||
|
|
||||||
#[derive(Parser, Debug)]
|
|
||||||
#[command(author, version, about, long_about = None)]
|
|
||||||
struct Opts {
|
|
||||||
#[arg(short, long, help = "Input file (sha256 hashes csv)")]
|
|
||||||
input_file: PathBuf,
|
|
||||||
|
|
||||||
#[arg(short, long, help = "Output file (bloom filter dump of sha256 hashes)")]
|
|
||||||
output_file: PathBuf,
|
|
||||||
|
|
||||||
#[arg(
|
|
||||||
long,
|
|
||||||
help = "Bloom filter: number of items",
|
|
||||||
default_value_t = 13_194_396_000
|
|
||||||
)]
|
|
||||||
num_items: usize,
|
|
||||||
|
|
||||||
#[arg(
|
|
||||||
long,
|
|
||||||
help = "Bloom filter: wanted rate of false positives",
|
|
||||||
default_value_t = 0.000_001
|
|
||||||
)]
|
|
||||||
fp_rate: f64,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn main() -> Result<(), Box<dyn Error>> {
|
|
||||||
let opts: Opts = Opts::parse();
|
|
||||||
|
|
||||||
let mut bloom = Bloom::new_for_fp_rate(opts.num_items, opts.fp_rate);
|
|
||||||
|
|
||||||
let reader = BufReader::new(File::open(&opts.input_file)?).lines();
|
|
||||||
|
|
||||||
let writer = File::create(&opts.output_file)
|
|
||||||
.expect("error opening output file");
|
|
||||||
let mut writer = BufWriter::new(writer);
|
|
||||||
|
|
||||||
for (count, line) in reader.enumerate() {
|
|
||||||
if count & 0b1_1111_1111_1111_1111_1111 == 0 {
|
|
||||||
println!("Read {} lines", count);
|
|
||||||
}
|
|
||||||
|
|
||||||
match line {
|
|
||||||
Ok(hash) => {
|
|
||||||
bloom.set(&hash)
|
|
||||||
}
|
|
||||||
Err(err) => eprintln!("Error reading line record: {}", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Dump bloom filter to file
|
|
||||||
println!("Serializing bloom filter to output file");
|
|
||||||
|
|
||||||
// - metadata
|
|
||||||
writer.write_all(&bloom.number_of_bits().to_be_bytes())?;
|
|
||||||
writer.write_all(&bloom.number_of_hash_functions().to_be_bytes())?;
|
|
||||||
writer.write_all(&bloom.sip_keys()[0].0.to_be_bytes())?;
|
|
||||||
writer.write_all(&bloom.sip_keys()[0].1.to_be_bytes())?;
|
|
||||||
writer.write_all(&bloom.sip_keys()[1].0.to_be_bytes())?;
|
|
||||||
writer.write_all(&bloom.sip_keys()[1].1.to_be_bytes())?;
|
|
||||||
|
|
||||||
writer.flush()?;
|
|
||||||
|
|
||||||
// - bitmap
|
|
||||||
writer.write_all(&bloom.bitmap())?;
|
|
||||||
|
|
||||||
writer.flush()?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
Loading…
Reference in New Issue