[Algorithm] Bloom Filter 布隆过滤器 / Cuckoo Filter 布谷鸟过滤器

背景故事什么的没有。

Alice

Bloom Filter 布隆过滤器

用于判断集合中是否存在某个值，但有可能误判（将不存在判断为存在）

原理类似 Hash Table，开一个大小为 $n$ 的 bitset，设计 $k$ 个 Hash 函数，插入元素时将 bitset 中 Hash 值对应的位设为 $1$ 。查询时若 $k$ 个位置均为 $1$ 则认为存在。

优点：内存占用小，时间复杂度稳定 $O(k)$
缺点：不能删除（如果需要删除可以将 bitset 改为普通计数数组，但这样内存优势就不大了），存在误判

Bloom Filter Parameter Analysis

考虑插入 $m$ 个元素后，某个位置为 $1$ 的概率为

$p_1 = 1 - p_0 = 1 - (1 - \frac1n)^{mk} \approx 1 - e^{-\frac{mk}{n}}$

其中用到 $\lim\limits_{n\rightarrow +\infty}(1 - \frac{1}{n})^{-n} = e$

此时再查询一个元素，且这个元素本不在集合中，误判（ $k$ 次 Hash 结果对应位均为 $1$ ）的概率为

$p = p_1^k = (1 - e^{-\frac{mk}{n}})^k$

可以求导求最小值，也可以稍微变换一下：

$p = e^{k\ln(1 - e^{-\frac{mk}{n}})} = e^{-\frac{n}{m}\ln a \ln(1 - a)}$

$a = e^{-\frac{mk}{n}}$

由于 $\ln a \ln(1 - a)$ 在 $a \in (0, 1)$ 上关于 $a = \frac12$ 对称，并在该点取得最大值，所以当 $a = 1 - a$ ，即 $k = \frac{n}{m}\ln 2$ 时误判率 $p = e^{-\frac nm \ln^22}$ 最小

同时也可以解得 $n = -\frac{m\ln p}{\ln^22}$ ，可以根据需求决定 $n$ 的大小

Bloom Filter Implementation

根据自己的理解简单使用 C++ 实现了一个 std::string 的 Bloom Filter

不是线程安全的，想做的话加锁非常简单

话说我一直在想对于简单的申请数组情况到底该不该用智能指针呢，感觉完全没必要，很麻烦，然而网上又推崇所有指针都用智能指针

#include <algorithm>
#include <bitset>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <vector>

namespace bloom {

// Shared pseudo-random generator used to draw one seed per hash function.
// It is seeded from the system clock, so the bit positions a given string maps
// to differ from one run of the program to the next.
static std::mt19937_64
    randnum_gen_(std::chrono::system_clock::now().time_since_epoch().count());

const size_t kMaxBitmapSize = UINT32_MAX;
const double kDefaultMisProbability = 0.2;
const double kLn2 = std::log(2.0);
// Multiplier of the polynomial rolling hash in create_hash_function().
const size_t kHashBase = 13331;

// Builds a string hash function that maps into [0, range).
//
// The hash is a polynomial rolling hash: it starts from `seed`, folds in every
// character as `result * kHashBase + c` (wrapping in size_t), and finally
// reduces the value modulo `range`.
//
// seed:  initial accumulator value; different seeds give different functions.
// range: exclusive upper bound of the returned hash; must be non-zero.
//
// NOTE(review): functions that differ only in `seed` are closely related for
// inputs of equal length (the seed contributes seed * kHashBase^len before the
// modulo); a stronger mixing step may be worth considering.
auto create_hash_function(const size_t seed, const size_t range)
    -> std::function<size_t(const std::string &)> {
  assert(range > 0);
  // Capture by value: the returned callable outlives this call.
  return [seed, range](const std::string &s) -> size_t {
    size_t result = seed;
    for (const char c : s) {
      result = result * kHashBase + c;
    }
    return result % range;
  };
}

// Fixed-size, non-copyable bit array packed eight bits per byte.
class Bitmap {
private:
  // Number of addressable bits.
  const size_t size_;

  // Backing storage; every bit starts cleared.
  std::vector<uint8_t> bitmap_;

public:
  // Creates a bitmap with `size` bits, all cleared. `size` must be positive.
  explicit Bitmap(const size_t size)
      // Byte count is ceil(size / 8), written so it cannot overflow.
      : size_(size), bitmap_(size / 8 + (size % 8 != 0 ? 1 : 0)) {
    assert(size > 0);
  }

  Bitmap(const Bitmap &) = delete;
  Bitmap &operator=(const Bitmap &) = delete;

  // Returns the number of bits in the bitmap.
  auto size() const noexcept -> size_t { return size_; }

  // Sets bit `pos` to 1. `pos` must be less than size().
  void Set(const size_t pos) {
    assert(pos < size_);
    bitmap_[pos >> 3] |= static_cast<uint8_t>(1u << (pos & 7));
  }

  // Returns bit `pos` as 0 or 1. `pos` must be less than size().
  auto Get(const size_t pos) const -> int {
    assert(pos < size_);
    return (bitmap_[pos >> 3] >> (pos & 7)) & 1;
  }
};

// Bloom filter over std::string keys.
//
// Insert() sets one bit per hash function; Contains() reports true only when
// all of those bits are set. An inserted key is therefore always reported as
// present, while a key that was never inserted may occasionally be reported
// as present too. Removal is not supported.
class BloomFilter {
private:
  // Expected number of inserted elements (m).
  size_t element_size_;

  // Number of bits in the bitmap (n).
  size_t bitmap_size_;

  // Number of hash functions (k).
  size_t hash_num_;

  // Misjudgement probability exp(-(n / m) * ln(2)^2). This is the value for
  // the optimal, non-truncated k; it is not adjusted for the hash count that
  // is actually used.
  double mis_probability_;

  std::vector<std::function<size_t(const std::string &)>> hash_function_;

  Bitmap bitmap_;

public:
  // element_size: expected number of elements; must be positive.
  // bitmap_size:  number of bits to allocate; must be positive.
  // hash_num:     number of hash functions; 0 (the default) selects
  //               (bitmap_size / element_size) * ln 2, truncated, with a
  //               minimum of one.
  explicit BloomFilter(const size_t element_size, const size_t bitmap_size,
                       const size_t hash_num = 0)
      : element_size_(element_size), bitmap_size_(bitmap_size),
        hash_num_(hash_num),
        mis_probability_(std::exp(-static_cast<double>(bitmap_size) /
                                  element_size * kLn2 * kLn2)),
        bitmap_(bitmap_size) {
    assert(element_size > 0);
    if (hash_num == 0) {
      const double optimal =
          static_cast<double>(bitmap_size) / element_size * kLn2;
      // Converting a non-finite double to size_t is undefined, so fall back to
      // a single hash function if the ratio could not be computed.
      hash_num_ = (std::isfinite(optimal) && optimal >= 1.0)
                      ? static_cast<size_t>(optimal)
                      : 1;
    }
    hash_function_.reserve(hash_num_);
    for (size_t i = 0; i < hash_num_; ++i) {
      hash_function_.push_back(
          create_hash_function(randnum_gen_(), bitmap_size));
    }
  }

  ~BloomFilter() = default;

  BloomFilter(const BloomFilter &) = delete;
  BloomFilter &operator=(const BloomFilter &) = delete;

  auto element_size() const noexcept -> size_t { return element_size_; }

  auto bitmap_size() const noexcept -> size_t { return bitmap_size_; }

  auto hash_num() const noexcept -> size_t { return hash_num_; }

  auto mis_probability() const noexcept -> double { return mis_probability_; }

  // Records `s` by setting the bit selected by every hash function.
  void Insert(const std::string &s) {
    for (const auto &func : hash_function_) {
      bitmap_.Set(func(s));
    }
  }

  // Returns false if `s` was definitely never inserted, true if it may have
  // been (every bit selected by the hash functions is set).
  auto Contains(const std::string &s) const -> bool {
    for (const auto &func : hash_function_) {
      if (bitmap_.Get(func(s)) == 0) {
        return false;
      }
    }
    return true;
  }
};
} // namespace bloom

int main() {
  bloom::BloomFilter set(10, 100);
  std::cout << set.hash_num() << std::endl;
  // 6
  std::cout << set.mis_probability() << std::endl;
  // 0.00819255

  std::vector<std::string> strs = {
      "Alice", "Bob", "Carol", "Tairitsu", "Hikari", "Mizuki", "A", "B", "C"};
  for (const auto &s : strs) {
    set.Insert(s);
  }

  std::cout << set.Contains("Alice") << std::endl;
  // 1
  std::cout << set.Contains("alice") << std::endl;
  // 0
  return 0;
}