diff options
author | Simon Sapin <simon.sapin@exyr.org> | 2015-02-21 20:58:11 +0100 |
---|---|---|
committer | Simon Sapin <simon.sapin@exyr.org> | 2015-02-23 16:29:34 +0100 |
commit | 2a50755c8a751218c8e3d5de53c831deaf061e64 (patch) | |
tree | d82d835f99d9d0dd333785da3928ae91326484c5 /components/util/bloom.rs | |
parent | 2e1adb3fa670504fb0fedaa517f312ba233bf67b (diff) | |
download | servo-2a50755c8a751218c8e3d5de53c831deaf061e64.tar.gz servo-2a50755c8a751218c8e3d5de53c831deaf061e64.zip |
Move selector matching to an external library, for use outside Servo.
Diffstat (limited to 'components/util/bloom.rs')
-rw-r--r-- | components/util/bloom.rs | 338 |
1 files changed, 0 insertions, 338 deletions
diff --git a/components/util/bloom.rs b/components/util/bloom.rs deleted file mode 100644 index 6fd5d17e775..00000000000 --- a/components/util/bloom.rs +++ /dev/null @@ -1,338 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -//! Simple counting bloom filters. - -use string_cache::{Atom, Namespace}; - -const KEY_SIZE: uint = 12; -const ARRAY_SIZE: uint = 1 << KEY_SIZE; -const KEY_MASK: u32 = (1 << KEY_SIZE) - 1; -const KEY_SHIFT: uint = 16; - -/// A counting Bloom filter with 8-bit counters. For now we assume -/// that having two hash functions is enough, but we may revisit that -/// decision later. -/// -/// The filter uses an array with 2**KeySize entries. -/// -/// Assuming a well-distributed hash function, a Bloom filter with -/// array size M containing N elements and -/// using k hash function has expected false positive rate exactly -/// -/// $ (1 - (1 - 1/M)^{kN})^k $ -/// -/// because each array slot has a -/// -/// $ (1 - 1/M)^{kN} $ -/// -/// chance of being 0, and the expected false positive rate is the -/// probability that all of the k hash functions will hit a nonzero -/// slot. -/// -/// For reasonable assumptions (M large, kN large, which should both -/// hold if we're worried about false positives) about M and kN this -/// becomes approximately -/// -/// $$ (1 - \exp(-kN/M))^k $$ -/// -/// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, -/// or in other words -/// -/// $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ -/// -/// where r is the false positive rate. This can be used to compute -/// the desired KeySize for a given load N and false positive rate r. -/// -/// If N/M is assumed small, then the false positive rate can -/// further be approximated as 4*N^2/M^2. So increasing KeySize by -/// 1, which doubles M, reduces the false positive rate by about a -/// factor of 4, and a false positive rate of 1% corresponds to -/// about M/N == 20. -/// -/// What this means in practice is that for a few hundred keys using a -/// KeySize of 12 gives false positive rates on the order of 0.25-4%. -/// -/// Similarly, using a KeySize of 10 would lead to a 4% false -/// positive rate for N == 100 and to quite bad false positive -/// rates for larger N. -pub struct BloomFilter { - counters: [u8; ARRAY_SIZE], -} - -impl Clone for BloomFilter { - #[inline] - fn clone(&self) -> BloomFilter { - BloomFilter { - counters: self.counters, - } - } -} - -impl BloomFilter { - /// Creates a new bloom filter. - #[inline] - pub fn new() -> BloomFilter { - BloomFilter { - counters: [0; ARRAY_SIZE], - } - } - - #[inline] - fn first_slot(&self, hash: u32) -> &u8 { - &self.counters[hash1(hash) as uint] - } - - #[inline] - fn first_mut_slot(&mut self, hash: u32) -> &mut u8 { - &mut self.counters[hash1(hash) as uint] - } - - #[inline] - fn second_slot(&self, hash: u32) -> &u8 { - &self.counters[hash2(hash) as uint] - } - - #[inline] - fn second_mut_slot(&mut self, hash: u32) -> &mut u8 { - &mut self.counters[hash2(hash) as uint] - } - - #[inline] - pub fn clear(&mut self) { - self.counters = [0; ARRAY_SIZE] - } - - #[inline] - fn insert_hash(&mut self, hash: u32) { - { - let slot1 = self.first_mut_slot(hash); - if !full(slot1) { - *slot1 += 1 - } - } - { - let slot2 = self.second_mut_slot(hash); - if !full(slot2) { - *slot2 += 1 - } - } - } - - /// Inserts an item into the bloom filter. - #[inline] - pub fn insert<T:BloomHash>(&mut self, elem: &T) { - self.insert_hash(elem.bloom_hash()) - - } - - #[inline] - fn remove_hash(&mut self, hash: u32) { - { - let slot1 = self.first_mut_slot(hash); - if !full(slot1) { - *slot1 -= 1 - } - } - { - let slot2 = self.second_mut_slot(hash); - if !full(slot2) { - *slot2 -= 1 - } - } - } - - /// Removes an item from the bloom filter. - #[inline] - pub fn remove<T:BloomHash>(&mut self, elem: &T) { - self.remove_hash(elem.bloom_hash()) - } - - #[inline] - fn might_contain_hash(&self, hash: u32) -> bool { - *self.first_slot(hash) != 0 && *self.second_slot(hash) != 0 - } - - /// Check whether the filter might contain an item. This can - /// sometimes return true even if the item is not in the filter, - /// but will never return false for items that are actually in the - /// filter. - #[inline] - pub fn might_contain<T:BloomHash>(&self, elem: &T) -> bool { - self.might_contain_hash(elem.bloom_hash()) - } -} - -pub trait BloomHash { - fn bloom_hash(&self) -> u32; -} - -impl BloomHash for int { - #[allow(exceeding_bitshifts)] - #[inline] - fn bloom_hash(&self) -> u32 { - ((*self >> 32) ^ *self) as u32 - } -} - -impl BloomHash for uint { - #[allow(exceeding_bitshifts)] - #[inline] - fn bloom_hash(&self) -> u32 { - ((*self >> 32) ^ *self) as u32 - } -} - -impl BloomHash for Atom { - #[inline] - fn bloom_hash(&self) -> u32 { - ((self.data >> 32) ^ self.data) as u32 - } -} - -impl BloomHash for Namespace { - #[inline] - fn bloom_hash(&self) -> u32 { - let Namespace(ref atom) = *self; - atom.bloom_hash() - } -} - -#[inline] -fn full(slot: &u8) -> bool { - *slot == 0xff -} - -#[inline] -fn hash1(hash: u32) -> u32 { - hash & KEY_MASK -} - -#[inline] -fn hash2(hash: u32) -> u32 { - (hash >> KEY_SHIFT) & KEY_MASK -} - -#[test] -fn create_and_insert_some_stuff() { - use std::iter::range; - - let mut bf = BloomFilter::new(); - - for i in range(0u, 1000) { - bf.insert(&i); - } - - for i in range(0u, 1000) { - assert!(bf.might_contain(&i)); - } - - let false_positives = - range(1001u, 2000).filter(|i| bf.might_contain(i)).count(); - - assert!(false_positives < 10); // 1%. - - for i in range(0u, 100) { - bf.remove(&i); - } - - for i in range(100u, 1000) { - assert!(bf.might_contain(&i)); - } - - let false_positives = range(0u, 100).filter(|i| bf.might_contain(i)).count(); - - assert!(false_positives < 2); // 2%. - - bf.clear(); - - for i in range(0u, 2000) { - assert!(!bf.might_contain(&i)); - } -} - -#[cfg(test)] -mod bench { - extern crate test; - - use std::hash::{hash, SipHasher}; - use std::iter; - use super::BloomFilter; - - #[bench] - fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) { - b.iter(|| { - let mut bf = BloomFilter::new(); - for i in iter::range(0u, 1000) { - bf.insert(&i); - } - for i in iter::range(0u, 100) { - bf.remove(&i); - } - for i in iter::range(100u, 200) { - test::black_box(bf.might_contain(&i)); - } - }); - } - - #[bench] - fn might_contain(b: &mut test::Bencher) { - let mut bf = BloomFilter::new(); - - for i in iter::range(0u, 1000) { - bf.insert(&i); - } - - let mut i = 0u; - - b.bench_n(1000, |b| { - b.iter(|| { - test::black_box(bf.might_contain(&i)); - i += 1; - }); - }); - } - - #[bench] - fn insert(b: &mut test::Bencher) { - let mut bf = BloomFilter::new(); - - b.bench_n(1000, |b| { - let mut i = 0u; - - b.iter(|| { - test::black_box(bf.insert(&i)); - i += 1; - }); - }); - } - - #[bench] - fn remove(b: &mut test::Bencher) { - let mut bf = BloomFilter::new(); - for i in range(0u, 1000) { - bf.insert(&i); - } - - b.bench_n(1000, |b| { - let mut i = 0u; - - b.iter(|| { - bf.remove(&i); - i += 1; - }); - }); - - test::black_box(bf.might_contain(&0u)); - } - - #[bench] - fn hash_a_uint(b: &mut test::Bencher) { - let mut i = 0u; - b.iter(|| { - test::black_box(hash::<uint, SipHasher>(&i)); - i += 1; - }) - } -} |