Bloom Filter
Space | Time | |||
---|---|---|---|---|
Access | Lookup | Insertion | Deletion | |
O(n) | O(1) |
| O(k) |
|
Definition​
- Short
- Detailed
Bloom Filter is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set.
Simplified
Bloom Filter is like a super-efficient memory that can quickly check if an item is in a set. It can say "probably yes" or "definitely no". The "probably yes" can sometimes be wrong, but "definitely no" is always right. For example, your browser uses it to check if a URL is safe.
Bloom Filter (invented by Burton Howard Bloom) is a fast, memory-efficient data structure that checks if an element is in a set. It can return false positives but never false negatives. It's useful when conventional hashing would require too much memory.
The filter is a bit array of m
bits, all set to 0 initially. It uses k
hash functions that map an element to one of the m
array positions. The choice of k
and m
depends on the desired
false positive rate.
The filter supports two operations: insertion and search. Deletion isn't possible. When inserting, the hash functions create indexes and set the corresponding bits to true. When searching, if all hashed indexes are true, the element might be in the set. If any index is false, the element is definitely not in the set.
False positive probability depends on the filter size, number of hash functions, and number of inserted items.
It's calculated as:
(1-e-kn/m)k
k
is the number of hash functionsm
is the filter sizen
is the number of items inserted
These parameters should be chosen based on the acceptable false positive rate.
Practice​
- Practice
- Solution
Aspect | Pseudo Code |
---|---|
Insertion |
|
May Contain |
|
package main
type BloomFilter struct {
size int
storage []bool
}
func NewBloomFilter(size int) *BloomFilter {
return &BloomFilter{
size: size,
storage: make([]bool, size),
}
}
func (bf *BloomFilter) insert(item string) {
hashValues := bf.getHashValues(item)
for _, val := range hashValues {
bf.storage[val] = true
}
}
func (bf *BloomFilter) mayContain(item string) bool {
hashValues := bf.getHashValues(item)
for _, hashIndex := range hashValues {
if !bf.storage[hashIndex] {
return false
}
}
return true
}
func (bf *BloomFilter) getHashValues(item string) []int {
return []int{bf.firstHash(item), bf.secondHash(item), bf.thirdHash(item)}
}
func (bf *BloomFilter) firstHash(item string) int {
hash := 0
for _, char := range item {
hash = (hash << 5) + hash + int(char)
hash &= hash
hash = abs(hash)
}
return hash % bf.size
}
func (bf *BloomFilter) secondHash(item string) int {
hash := 5381
for _, char := range item {
hash = (hash << 5) + hash + int(char)
}
return abs(hash) % bf.size
}
func (bf *BloomFilter) thirdHash(item string) int {
hash := 0
for _, char := range item {
hash = (hash << 5) - hash
hash += int(char)
hash &= hash
}
return abs(hash) % bf.size
}
func abs(x int) int {
if x < 0 {
return -x
}
return x
}
import java.util.function.IntUnaryOperator;
public class BloomFilter {
private final int size;
private final boolean[] storage;
public BloomFilter(int size) {
this.size = size;
this.storage = createStore(size);
}
public void insert(String item) {
int[] hashValues = getHashValues(item);
for (int val : hashValues) {
setValue(val);
}
}
public boolean mayContain(String item) {
int[] hashValues = getHashValues(item);
for (int hashIndex : hashValues) {
if (!getValue(hashIndex)) {
return false;
}
}
return true;
}
private boolean[] createStore(int size) {
boolean[] storage = new boolean[size];
return storage;
}
private void setValue(int index) {
storage[index] = true;
}
private boolean getValue(int index) {
return storage[index];
}
private int firstHash(String item) {
int hash = 0;
for (int charIndex = 0; charIndex < item.length(); charIndex++) {
char character = item.charAt(charIndex);
hash = (hash << 5) + hash + character;
hash &= hash;
hash = Math.abs(hash);
}
return hash % size;
}
private int secondHash(String item) {
int hash = 5381;
for (int charIndex = 0; charIndex < item.length(); charIndex++) {
char character = item.charAt(charIndex);
hash = (hash << 5) + hash + character;
}
return Math.abs(hash % size);
}
private int thirdHash(String item) {
int hash = 0;
for (int charIndex = 0; charIndex < item.length(); charIndex++) {
char character = item.charAt(charIndex);
hash = (hash << 5) - hash;
hash += character;
hash &= hash;
}
return Math.abs(hash % size);
}
private int[] getHashValues(String item) {
return new int[]{firstHash(item), secondHash(item), thirdHash(item)};
}
}
class BloomFilter {
constructor(size = 100) {
this.size = size;
this.storage = this.createStore(size);
}
insert(item) {
const hashValues = this.getHashValues(item);
hashValues.forEach((val) => this.storage.setValue(val));
}
mayContain(item) {
const hashValues = this.getHashValues(item);
for (let hashIndex = 0; hashIndex < hashValues.length; hashIndex += 1) {
if (!this.storage.getValue(hashValues[hashIndex])) {
return false;
}
}
return true;
}
createStore(size) {
const storage = [];
for (
let storageCellIndex = 0;
storageCellIndex < size;
storageCellIndex += 1
) {
storage.push(false);
}
const storageInterface = {
getValue(index) {
return storage[index];
},
setValue(index) {
storage[index] = true;
},
};
return storageInterface;
}
firstHash(item) {
let hash = 0;
for (let charIndex = 0; charIndex < item.length; charIndex += 1) {
const char = item.charCodeAt(charIndex);
hash = (hash << 5) + hash + char;
hash &= hash;
hash = Math.abs(hash);
}
return hash % this.size;
}
secondHash(item) {
let hash = 5381;
for (let charIndex = 0; charIndex < item.length; charIndex += 1) {
const char = item.charCodeAt(charIndex);
hash = (hash << 5) + hash + char;
}
return Math.abs(hash % this.size);
}
thirdHash(item) {
let hash = 0;
for (let charIndex = 0; charIndex < item.length; charIndex += 1) {
const char = item.charCodeAt(charIndex);
hash = (hash << 5) - hash;
hash += char;
hash &= hash;
}
return Math.abs(hash % this.size);
}
getHashValues(item) {
return [this.firstHash(item), this.secondHash(item), this.thirdHash(item)];
}
}
class BloomFilter(size: Int = 100) {
private val size: Int = size
private val storage: BooleanArray = createStore(size)
fun insert(item: String) {
val hashValues = getHashValues(item)
hashValues.forEach { storage[it] = true }
}
fun mayContain(item: String): Boolean {
val hashValues = getHashValues(item)
for (hashIndex in hashValues.indices) {
if (!storage[hashValues[hashIndex]]) {
return false
}
}
return true
}
private fun createStore(size: Int): BooleanArray {
return BooleanArray(size) { false }
}
private fun firstHash(item: String): Int {
var hash = 0
for (charIndex in item.indices) {
val char = item[charIndex].toInt()
hash = (hash shl 5) + hash + char
hash = hash and hash
hash = hash.absoluteValue
}
return hash % size
}
private fun secondHash(item: String): Int {
var hash = 5381
for (charIndex in item.indices) {
val char = item[charIndex].toInt()
hash = (hash shl 5) + hash + char
}
return hash.absoluteValue % size
}
private fun thirdHash(item: String): Int {
var hash = 0
for (charIndex in item.indices) {
val char = item[charIndex].toInt()
hash = (hash shl 5) - hash
hash += char
hash = hash and hash
}
return hash.absoluteValue % size
}
private fun getHashValues(item: String): IntArray {
return intArrayOf(firstHash(item), secondHash(item), thirdHash(item))
}
}
class BloomFilter:
def __init__(self, size=100):
self.size = size
self.storage = self.create_store(size)
def insert(self, item):
hash_values = self.get_hash_values(item)
for val in hash_values:
self.storage[val] = True
def may_contain(self, item):
hash_values = self.get_hash_values(item)
for hash_index in range(len(hash_values)):
if not self.storage[hash_values[hash_index]]:
return False
return True
def create_store(self, size):
return [False] * size
def first_hash(self, item):
hash_val = 0
for char_index in range(len(item)):
char = ord(item[char_index])
hash_val = (hash_val << 5) + hash_val + char
hash_val &= hash_val
hash_val = abs(hash_val)
return hash_val % self.size
def second_hash(self, item):
hash_val = 5381
for char_index in range(len(item)):
char = ord(item[char_index])
hash_val = (hash_val << 5) + hash_val + char
return abs(hash_val % self.size)
def third_hash(self, item):
hash_val = 0
for char_index in range(len(item)):
char = ord(item[char_index])
hash_val = (hash_val << 5) - hash_val
hash_val += char
hash_val &= hash_val
return abs(hash_val % self.size)
def get_hash_values(self, item):
return [self.first_hash(item), self.second_hash(item), self.third_hash(item)]
struct BloomFilter {
size: usize,
storage: Vec<bool>,
}
impl BloomFilter {
fn new(size: usize) -> Self {
BloomFilter {
size,
storage: vec![false; size],
}
}
fn insert(&mut self, item: &str) {
let hash_values = self.get_hash_values(item);
for val in hash_values {
self.storage[val] = true;
}
}
fn may_contain(&self, item: &str) -> bool {
let hash_values = self.get_hash_values(item);
for &hash_index in &hash_values {
if !self.storage[hash_index] {
return false;
}
}
true
}
fn create_store(&self) -> Vec<bool> {
vec![false; self.size]
}
fn first_hash(&self, item: &str) -> usize {
let mut hash = 0;
for char in item.chars() {
let char_code = char as usize;
hash = (hash << 5) + hash + char_code;
hash &= hash;
hash = hash.abs();
}
hash % self.size
}
fn second_hash(&self, item: &str) -> usize {
let mut hash = 5381;
for char in item.chars() {
let char_code = char as usize;
hash = (hash << 5) + hash + char_code;
}
hash.abs() % self.size
}
fn third_hash(&self, item: &str) -> usize {
let mut hash = 0;
for char in item.chars() {
let char_code = char as usize;
hash = (hash << 5) - hash;
hash += char_code;
hash &= hash;
}
hash.abs() % self.size
}
fn get_hash_values(&self, item: &str) -> Vec<usize> {
vec![
self.first_hash(item),
self.second_hash(item),
self.third_hash(item),
]
}
}
class BloomFilter {
private size: number;
private storage: { [index: number]: boolean };
constructor(size: number = 100) {
this.size = size;
this.storage = this.createStore(size);
}
insert(item: string): void {
const hashValues = this.getHashValues(item);
hashValues.forEach((val) => {
this.storage[val] = true;
});
}
mayContain(item: string): boolean {
const hashValues = this.getHashValues(item);
for (let hashIndex = 0; hashIndex < hashValues.length; hashIndex += 1) {
if (!this.storage[hashValues[hashIndex]]) {
return false;
}
}
return true;
}
private createStore(size: number): { [index: number]: boolean } {
const storage: { [index: number]: boolean } = {};
for (
let storageCellIndex = 0;
storageCellIndex < size;
storageCellIndex += 1
) {
storage[storageCellIndex] = false;
}
return storage;
}
private firstHash(item: string): number {
let hash = 0;
for (let charIndex = 0; charIndex < item.length; charIndex += 1) {
const char = item.charCodeAt(charIndex);
hash = (hash << 5) + hash + char;
hash &= hash;
hash = Math.abs(hash);
}
return hash % this.size;
}
private secondHash(item: string): number {
let hash = 5381;
for (let charIndex = 0; charIndex < item.length; charIndex += 1) {
const char = item.charCodeAt(charIndex);
hash = (hash << 5) + hash + char;
}
return Math.abs(hash % this.size);
}
private thirdHash(item: string): number {
let hash = 0;
for (let charIndex = 0; charIndex < item.length; charIndex += 1) {
const char = item.charCodeAt(charIndex);
hash = (hash << 5) - hash;
hash += char;
hash &= hash;
}
return Math.abs(hash % this.size);
}
private getHashValues(item: string): number[] {
return [this.firstHash(item), this.secondHash(item), this.thirdHash(item)];
}
}