1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
//! Implementation of a bloom filter.
use sha2::{Digest, Sha256};
/// Simple implementation of a Bloom Filter. Which is guaranteed to return 1 if an element
/// is in the set, but returns 1 with probability p (settable) if an item is not in the
/// set. Does not reveal what is in the set.
#[derive(Debug, PartialEq, PartialOrd)]
pub struct BloomFilter {
bits: Vec<bool>,
nhashes: usize,
}
impl BloomFilter {
/// Create a new BloomFilter with `size` entries, using `nhashes` hash functions.
pub fn new(size: usize, nhashes: usize) -> Self {
BloomFilter {
bits: vec![false; size],
nhashes,
}
}
/// Compute required expansion for false positive probability `p`.
///
/// That is - if you plan to insert `n` items into the BloomFilter, and want a false
/// positive probability of `p`, then you should set the BloomFilter size to
/// `compute_expansion(p) * n`.
pub fn compute_expansion(p: f64) -> f64 {
-1.44 * p.log2()
}
/// Compute required number of hash functions for false positive probability `p`.
pub fn compute_nhashes(p: f64) -> usize {
(-p.log2()).ceil() as usize
}
/// Create a new BloomFilter with false positive probability `p` which can support up
/// to `n` insertions.
pub fn with_false_positive_prob(p: f64, n: usize) -> Self {
Self::new(
(Self::compute_expansion(p) * n as f64).ceil() as usize,
Self::compute_nhashes(p),
)
}
/// Get the number of bins in this BloomFilter.
pub fn len(&self) -> usize {
self.bits.len()
}
/// Get the number of hash functions in this BloomFilter.
pub fn nhashes(&self) -> usize {
self.nhashes
}
/// Get bloom filter bins.
pub fn bins(&self) -> &[bool] {
&self.bits
}
/// Get bloom filter bins packed in bytes.
pub fn as_bytes(&self) -> Vec<u8> {
crate::utils::pack_bits(self.bins())
}
/// Create bloom filter from bytes.
pub fn from_bytes(bytes: &[u8], size: usize, nhashes: usize) -> Self {
let bits = crate::utils::unpack_bits(bytes, size);
BloomFilter { bits, nhashes }
}
/// Compute the bin that this value would go to in a BloomFilter.
///
/// Result must be modded by the actual size of the bloom filter to avoid out of
/// bounds errors.
pub fn bin<V: AsRef<[u8]>>(value: &V, hash_index: usize) -> usize {
// TODO: This code probably needs to use fixed-size integer types in order to be portable to
// 32-bit architectures.
debug_assert_eq!(std::mem::size_of::<usize>(), 8);
let mut h = Sha256::new();
h.update((hash_index as u64).to_le_bytes());
h.update(value);
let hbytes = h.finalize();
u64::from_le_bytes(
<[u8; 8]>::try_from(&hbytes[0..8]).expect("We're getting 8 bytes specifically"),
) as usize
}
/// Insert an item into the BloomFilter.
pub fn insert<V: AsRef<[u8]>>(&mut self, value: &V) {
for hash_index in 0..self.nhashes {
let i = Self::bin(value, hash_index) % self.len();
self.bits[i] = true;
}
}
/// Check whether an item exists in the BloomFilter.
pub fn contains<V: AsRef<[u8]>>(&mut self, value: &V) -> bool {
(0..self.nhashes).all(|hash_index| {
let i = Self::bin(value, hash_index) % self.len();
self.bits[i]
})
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{AesRng, Block};
use rand::Rng;
#[test]
fn test_bloom_filter_membership() {
let mut rng = AesRng::new();
let n = 1000;
let nhashes = 3;
let mut filter = BloomFilter::new(n, nhashes);
for _ in 0..128 {
let x = rng.gen::<Block>();
filter.insert(&x);
assert!(filter.contains(&x));
}
assert_eq!(
filter,
BloomFilter::from_bytes(&filter.as_bytes(), n, nhashes)
);
}
}