Bloom filter code moved to separate repository
This commit is contained in:
parent
6028c4794f
commit
58367cbe7f
File diff suppressed because it is too large
Load Diff
|
@ -1,14 +0,0 @@
|
|||
[package]
|
||||
name = "bloom-filter-generator"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
hex = "0.4.3"
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
rayon = "1.7.0"
|
||||
clap = {version = "4.0.32", features = ["derive"]}
|
||||
bloomfilter = "1"
|
|
@ -0,0 +1,9 @@
|
|||
# bloom-filter-generator
|
||||
|
||||
This code has moved to https://git.distrust.co/milksad/rust-bloom-filter-generator.
|
||||
|
||||
## Usage note
|
||||
|
||||
Make sure to use a suitable parameter value for bloom filter size that corresponds to the input data set.
|
||||
|
||||
Experiment with the false positive rate parameter to trade memory footprint against accuracy.
|
|
@ -1,75 +0,0 @@
|
|||
use std::error::Error;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, BufReader, BufRead, Write};
|
||||
use std::path::PathBuf;
|
||||
use bloomfilter::Bloom;
|
||||
use clap::{arg, Parser};
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
struct Opts {
|
||||
#[arg(short, long, help = "Input file (sha256 hashes csv)")]
|
||||
input_file: PathBuf,
|
||||
|
||||
#[arg(short, long, help = "Output file (bloom filter dump of sha256 hashes)")]
|
||||
output_file: PathBuf,
|
||||
|
||||
#[arg(
|
||||
long,
|
||||
help = "Bloom filter: number of items",
|
||||
default_value_t = 13_194_396_000
|
||||
)]
|
||||
num_items: usize,
|
||||
|
||||
#[arg(
|
||||
long,
|
||||
help = "Bloom filter: wanted rate of false positives",
|
||||
default_value_t = 0.000_001
|
||||
)]
|
||||
fp_rate: f64,
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
let opts: Opts = Opts::parse();
|
||||
|
||||
let mut bloom = Bloom::new_for_fp_rate(opts.num_items, opts.fp_rate);
|
||||
|
||||
let reader = BufReader::new(File::open(&opts.input_file)?).lines();
|
||||
|
||||
let writer = File::create(&opts.output_file)
|
||||
.expect("error opening output file");
|
||||
let mut writer = BufWriter::new(writer);
|
||||
|
||||
for (count, line) in reader.enumerate() {
|
||||
if count & 0b1_1111_1111_1111_1111_1111 == 0 {
|
||||
println!("Read {} lines", count);
|
||||
}
|
||||
|
||||
match line {
|
||||
Ok(hash) => {
|
||||
bloom.set(&hash)
|
||||
}
|
||||
Err(err) => eprintln!("Error reading line record: {}", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Dump bloom filter to file
|
||||
println!("Serializing bloom filter to output file");
|
||||
|
||||
// - metadata
|
||||
writer.write_all(&bloom.number_of_bits().to_be_bytes())?;
|
||||
writer.write_all(&bloom.number_of_hash_functions().to_be_bytes())?;
|
||||
writer.write_all(&bloom.sip_keys()[0].0.to_be_bytes())?;
|
||||
writer.write_all(&bloom.sip_keys()[0].1.to_be_bytes())?;
|
||||
writer.write_all(&bloom.sip_keys()[1].0.to_be_bytes())?;
|
||||
writer.write_all(&bloom.sip_keys()[1].1.to_be_bytes())?;
|
||||
|
||||
writer.flush()?;
|
||||
|
||||
// - bitmap
|
||||
writer.write_all(&bloom.bitmap())?;
|
||||
|
||||
writer.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
Loading…
Reference in New Issue