Bloom filter code moved to separate repository

This commit is contained in:
Christian Reitter 2024-12-15 14:31:37 +01:00
parent 6028c4794f
commit 58367cbe7f
4 changed files with 9 additions and 1576 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +0,0 @@
[package]
name = "bloom-filter-generator"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
hex = "0.4.3"
serde = "1.0"
serde_derive = "1.0"
rayon = "1.7.0"
clap = {version = "4.0.32", features = ["derive"]}
bloomfilter = "1"

View File

@ -0,0 +1,9 @@
# bloom-filter-generator
This code has moved to <https://git.distrust.co/milksad/rust-bloom-filter-generator>.
## Usage note
Make sure the bloom filter size parameter is set to a value that matches the size of the input data set.
Experiment with the false positive factor parameter to trade memory footprint against accuracy.

View File

@ -1,75 +0,0 @@
use std::error::Error;
use std::fs::File;
use std::io::{BufWriter, BufReader, BufRead, Write};
use std::path::PathBuf;
use bloomfilter::Bloom;
use clap::{arg, Parser};
// Command-line options for the bloom filter generator, parsed via clap's derive API.
// Plain `//` comments are used on purpose: clap derive turns `///` doc comments into
// help/about text, which would alter the generated CLI output.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Opts {
// Path of the file that `main` reads line by line; each line is inserted into the filter.
#[arg(short, long, help = "Input file (sha256 hashes csv)")]
input_file: PathBuf,
// Path of the file that `main` creates and writes the serialized filter to.
#[arg(short, long, help = "Output file (bloom filter dump of sha256 hashes)")]
output_file: PathBuf,
// Item count handed to `Bloom::new_for_fp_rate` in `main`.
// NOTE(review): the default exceeds u32::MAX, so it only fits where `usize` is wider
// than 32 bits - confirm that 32-bit targets are not expected to build this.
#[arg(
long,
help = "Bloom filter: number of items",
default_value_t = 13_194_396_000
)]
num_items: usize,
// False-positive rate handed to `Bloom::new_for_fp_rate` in `main`.
#[arg(
long,
help = "Bloom filter: wanted rate of false positives",
default_value_t = 0.000_001
)]
fp_rate: f64,
}
/// Builds a bloom filter from the lines of the input file and dumps it to the output file.
///
/// Every successfully read line of `--input-file` is inserted into a filter created with
/// `Bloom::new_for_fp_rate(num_items, fp_rate)`. Lines that fail to read are reported on
/// stderr and skipped; they do not abort the run.
///
/// The output file is written in this order:
/// 1. `number_of_bits()` as big-endian bytes
/// 2. `number_of_hash_functions()` as big-endian bytes
/// 3. the four values of `sip_keys()` (`[0].0`, `[0].1`, `[1].0`, `[1].1`), each as
///    big-endian bytes
/// 4. the raw bytes returned by `bitmap()`
///
/// # Errors
///
/// Returns an error if the input file cannot be opened, the output file cannot be
/// created, or writing/flushing the output fails.
fn main() -> Result<(), Box<dyn Error>> {
    let opts: Opts = Opts::parse();

    let mut bloom = Bloom::new_for_fp_rate(opts.num_items, opts.fp_rate);

    let reader = BufReader::new(File::open(&opts.input_file)?).lines();

    // Propagate the failure through `main`'s `Result` instead of panicking, while keeping
    // the original context message. The file is created before the (long) read phase so
    // an unwritable output path is detected early.
    let output = File::create(&opts.output_file)
        .map_err(|err| format!("error opening output file: {}", err))?;
    let mut writer = BufWriter::new(output);

    for (count, line) in reader.enumerate() {
        // The mask is 2^21 - 1, so progress is reported every 2_097_152 lines
        // (including once at index 0).
        if count & 0b1_1111_1111_1111_1111_1111 == 0 {
            println!("Read {} lines", count);
        }

        match line {
            Ok(hash) => bloom.set(&hash),
            // Best effort: report the unreadable line and keep going.
            Err(err) => eprintln!("Error reading line record: {}", err),
        }
    }

    // Dump bloom filter to file
    println!("Serializing bloom filter to output file");

    // - metadata
    writer.write_all(&bloom.number_of_bits().to_be_bytes())?;
    writer.write_all(&bloom.number_of_hash_functions().to_be_bytes())?;
    let sip_keys = bloom.sip_keys();
    writer.write_all(&sip_keys[0].0.to_be_bytes())?;
    writer.write_all(&sip_keys[0].1.to_be_bytes())?;
    writer.write_all(&sip_keys[1].0.to_be_bytes())?;
    writer.write_all(&sip_keys[1].1.to_be_bytes())?;

    // - bitmap
    writer.write_all(&bloom.bitmap())?;

    // Flush explicitly: dropping a `BufWriter` would silently swallow a write error.
    writer.flush()?;

    Ok(())
}