// Source/WTF/wtf/BloomFilter.h - WebKit - Git at Google

 /*
  * Copyright (C) 2011-2019 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */

 #pragma once

 #include <array>
 #include <cstring>
 #include <wtf/text/AtomString.h>

 namespace WTF {

 // Bloom filter with k=2. Uses 2^keyBits/8 bytes of memory.
 // False positive rate is approximately (1-e^(-2n/m))^2, where n is the number of unique
 // keys and m is the table size (==2^keyBits).
 // See http://en.wikipedia.org/wiki/Bloom_filter
 template <unsigned keyBits>
 class BloomFilter {
     WTF_MAKE_FAST_ALLOCATED;
 public:
     // Number of bits in the table. The shift is performed on size_t rather than int so it
     // stays well-defined for every keyBits accepted by the static_asserts below.
     static constexpr size_t tableSize = static_cast<size_t>(1) << keyBits;

     // Creates a filter with every bit cleared.
     BloomFilter();

     // Sets the two bits selected by hash and by hash >> 16.
     void add(unsigned hash);
     // For example SHA1::Digest. The first 2 * sizeof(unsigned) bytes supply the two keys.
     template <size_t hashSize> void add(const std::array<uint8_t, hashSize>&);

     // Merges another filter of the same size into this one (bitwise OR of the two tables).
     void add(const BloomFilter&);

     // The filter may give false positives (claim it may contain a key it doesn't)
     // but never false negatives (claim it doesn't contain a key it does).
     bool mayContain(unsigned hash) const;
     template <size_t hashSize> bool mayContain(const std::array<uint8_t, hashSize>&) const;

     // Resets every bit to zero.
     void clear();

     // String conveniences: these forward the hash stored on the string's impl().
     // impl() is dereferenced unconditionally.
     void add(const AtomString& string) { add(string.impl()->existingHash()); }
     void add(const String& string) { add(string.impl()->hash()); }
     bool mayContain(const AtomString& string) const { return mayContain(string.impl()->existingHash()); }
     bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); }

 private:
     // Number of table bits packed into each element of m_bitArray.
     static constexpr unsigned bitsPerPosition = 8 * sizeof(unsigned);
     static_assert(keyBits < bitsPerPosition, "keyBits must be smaller than the bit width of unsigned");
     static_assert(tableSize >= bitsPerPosition, "The table must span at least one word, otherwise m_bitArray would be empty");
     // Unsigned arithmetic: (1 << keyBits) - 1 on a signed int overflows when keyBits == 31.
     static constexpr unsigned keyMask = (1u << keyBits) - 1;
     static unsigned arrayIndex(unsigned key) { return key / bitsPerPosition; }
     // 1u rather than 1: the highest bit position would otherwise shift into the sign bit of an int.
     static unsigned bitMask(unsigned key) { return 1u << (key % bitsPerPosition); }
     template <size_t hashSize> static std::pair<unsigned, unsigned> keysFromHash(const std::array<uint8_t, hashSize>&);

     bool isBitSet(unsigned key) const;
     void setBit(unsigned key);

     std::array<unsigned, tableSize / bitsPerPosition> m_bitArray;
 };

 template <unsigned keyBits>
 inline BloomFilter<keyBits>::BloomFilter()
     : m_bitArray()
 {
 }

 template <unsigned keyBits>
 inline bool BloomFilter<keyBits>::mayContain(unsigned hash) const
 {
     // The bottom and top bits of the incoming hash serve as two independent bloom filter
     // hash functions. This works well as long as the filter size is not much above 2^16.
     if (!isBitSet(hash))
         return false;
     return isBitSet(hash >> 16);
 }

 template <unsigned keyBits>
 inline void BloomFilter<keyBits>::add(unsigned hash)
 {
     // Two keys are taken from a single hash: the value itself and its upper half.
     // setBit() masks each one down to the table size.
     const unsigned lowKey = hash;
     const unsigned highKey = hash >> 16;
     setBit(lowKey);
     setBit(highKey);
 }

 // Extracts the two filter keys from the leading bytes of a byte-array hash.
 // The first sizeof(unsigned) bytes form the first key and the following sizeof(unsigned)
 // bytes form the second, both read in the platform's native byte order.
 template <unsigned keyBits>
 template <size_t hashSize>
 inline std::pair<unsigned, unsigned> BloomFilter<keyBits>::keysFromHash(const std::array<uint8_t, hashSize>& hash)
 {
     // We could use larger k value than 2 for long hashes.
     static_assert(hashSize >= 2 * sizeof(unsigned), "Hash array too short");

     // Copy the bytes out instead of dereferencing a casted pointer: the byte array carries no
     // alignment guarantee for unsigned, and reading it through an unsigned lvalue breaks the
     // aliasing rules.
     unsigned firstKey;
     unsigned secondKey;
     std::memcpy(&firstKey, hash.data(), sizeof(unsigned));
     std::memcpy(&secondKey, hash.data() + sizeof(unsigned), sizeof(unsigned));
     return { firstKey, secondKey };
 }

 template <unsigned keyBits>
 template <size_t hashSize>
 inline bool BloomFilter<keyBits>::mayContain(const std::array<uint8_t, hashSize>& hash) const
 {
     // Both keys derived from the hash must have their bit set for a possible match.
     const std::pair<unsigned, unsigned> keyPair = keysFromHash(hash);
     if (!isBitSet(keyPair.first))
         return false;
     return isBitSet(keyPair.second);
 }

 template <unsigned keyBits>
 template <size_t hashSize>
 inline void BloomFilter<keyBits>::add(const std::array<uint8_t, hashSize>& hash)
 {
     // Record the hash by setting the bit for each of its two derived keys.
     const std::pair<unsigned, unsigned> keyPair = keysFromHash(hash);
     setBit(keyPair.first);
     setBit(keyPair.second);
 }

 template <unsigned keyBits>
 inline void BloomFilter<keyBits>::add(const BloomFilter& other)
 {
     // Union of the two filters: OR each word of the other table into ours.
     // Both tables have the same length because they share keyBits.
     auto source = other.m_bitArray.begin();
     for (auto& word : m_bitArray) {
         word |= *source;
         ++source;
     }
 }

 template <unsigned keyBits>
 bool BloomFilter<keyBits>::isBitSet(unsigned key) const
 {
     // Only the low keyBits bits of the key select a table position.
     unsigned maskedKey = key & keyMask;
     ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
     const unsigned word = m_bitArray[arrayIndex(maskedKey)];
     return (word & bitMask(maskedKey)) != 0;
 }

 template <unsigned keyBits>
 void BloomFilter<keyBits>::setBit(unsigned key)
 {
     // Only the low keyBits bits of the key select a table position.
     unsigned maskedKey = key & keyMask;
     ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
     unsigned& word = m_bitArray[arrayIndex(maskedKey)];
     word = word | bitMask(maskedKey);
 }

 template <unsigned keyBits>
 inline void BloomFilter<keyBits>::clear()
 {
     // Zero every word so the filter reports no keys.
     for (auto& word : m_bitArray)
         word = 0;
 }

 // Counting bloom filter with 8 bit counters. Uses 2^keyBits bytes of memory. Error rates as
 // for BloomFilter.
 // See http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
 template <unsigned keyBits>
 class CountingBloomFilter {
     WTF_MAKE_FAST_ALLOCATED;
 public:
     static_assert(keyBits < 8 * sizeof(unsigned), "keyBits must be smaller than the bit width of unsigned");

     // Number of counters in the table. The shift is performed on size_t rather than int.
     static constexpr size_t tableSize = static_cast<size_t>(1) << keyBits;
     // Largest value a counter can hold; a counter that reaches it is no longer changed by add() or remove().
     static unsigned maximumCount() { return std::numeric_limits<uint8_t>::max(); }

     // Creates a filter with every counter at zero.
     CountingBloomFilter();

     // Increment / decrement the two counters selected by hash and by hash >> 16.
     void add(unsigned hash);
     void remove(unsigned hash);

     // The filter may give false positives (claim it may contain a key it doesn't)
     // but never false negatives (claim it doesn't contain a key it does).
     bool mayContain(unsigned hash) const { return firstBucket(hash) && secondBucket(hash); }

     // The filter must be cleared before reuse even if all keys are removed.
     // Otherwise overflowed keys will stick around.
     void clear();

     // String conveniences: these forward the hash stored on the string's impl().
     // impl() is dereferenced unconditionally.
     void add(const AtomString& string) { add(string.impl()->existingHash()); }
     void add(const String& string) { add(string.impl()->hash()); }
     void remove(const AtomString& string) { remove(string.impl()->existingHash()); }
     void remove(const String& string) { remove(string.impl()->hash()); }

     bool mayContain(const AtomString& string) const { return mayContain(string.impl()->existingHash()); }
     bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); }

 #if !ASSERT_DISABLED
     // Slow: both walk the whole table.
     // likelyEmpty() ignores counters stuck at maximumCount(); isClear() requires every counter to be zero.
     bool likelyEmpty() const;
     bool isClear() const;
 #endif

 private:
     // Unsigned arithmetic: (1 << keyBits) - 1 on a signed int overflows when keyBits == 31.
     static constexpr unsigned keyMask = (1u << keyBits) - 1;

     // The low bits of the hash pick the first counter; the low bits of hash >> 16 pick the second.
     uint8_t& firstBucket(unsigned hash) { return m_buckets[hash & keyMask]; }
     uint8_t& secondBucket(unsigned hash) { return m_buckets[(hash >> 16) & keyMask]; }
     const uint8_t& firstBucket(unsigned hash) const { return m_buckets[hash & keyMask]; }
     const uint8_t& secondBucket(unsigned hash) const { return m_buckets[(hash >> 16) & keyMask]; }

     std::array<uint8_t, tableSize> m_buckets;
 };

 template <unsigned keyBits>
 inline CountingBloomFilter<keyBits>::CountingBloomFilter()
     : m_buckets()
 {
 }

 template <unsigned keyBits>
 inline void CountingBloomFilter<keyBits>::add(unsigned hash)
 {
     // The two entries refer to the same counter when both keys select the same bucket;
     // it is then bumped twice, in order.
     uint8_t* const buckets[] = { &firstBucket(hash), &secondBucket(hash) };
     for (uint8_t* bucket : buckets) {
         // A counter already at maximumCount() is left alone so it never wraps around.
         if (LIKELY(*bucket < maximumCount()))
             ++*bucket;
     }
 }

 template <unsigned keyBits>
 inline void CountingBloomFilter<keyBits>::remove(unsigned hash)
 {
     auto& first = firstBucket(hash);
     auto& second = secondBucket(hash);
     // Both counters are expected to be non-zero before either one is touched.
     ASSERT(first);
     ASSERT(second);
     // In case of an overflow, the bucket sticks in the table until clear().
     uint8_t* const buckets[] = { &first, &second };
     for (uint8_t* bucket : buckets) {
         if (LIKELY(*bucket < maximumCount()))
             --*bucket;
     }
 }

 template <unsigned keyBits>
 inline void CountingBloomFilter<keyBits>::clear()
 {
     // Reset every counter, including any that are stuck at maximumCount().
     for (uint8_t& bucket : m_buckets)
         bucket = 0;
 }

 #if !ASSERT_DISABLED
 template <unsigned keyBits>
 bool CountingBloomFilter<keyBits>::likelyEmpty() const
 {
     // Counters that are zero or stuck at maximumCount() are ignored; any other value
     // means the filter is not empty.
     for (size_t index = 0; index < m_buckets.size(); ++index) {
         const uint8_t count = m_buckets[index];
         const bool isVacant = !count;
         const bool isSaturated = count == maximumCount();
         if (!isVacant && !isSaturated)
             return false;
     }
     return true;
 }

 template <unsigned keyBits>
 bool CountingBloomFilter<keyBits>::isClear() const
 {
     // Advance past zero counters; the filter is clear only if the scan reaches the end.
     size_t index = 0;
     while (index < m_buckets.size() && !m_buckets[index])
         ++index;
     return index == m_buckets.size();
 }
 #endif

 }

 // Make both filter templates usable without the WTF:: prefix.
 using WTF::BloomFilter;
 using WTF::CountingBloomFilter;
	/*
	* Copyright (C) 2011-2019 Apple Inc. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
	* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
	* THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#pragma once

	#include <array>
	#include <wtf/text/AtomString.h>

	namespace WTF {

	// Bloom filter with k=2. Uses 2^keyBits/8 bytes of memory.
	// False positive rate is approximately (1-e^(-2n/m))^2, where n is the number of unique
	// keys and m is the table size (==2^keyBits).
	// See http://en.wikipedia.org/wiki/Bloom_filter
	template <unsigned keyBits>
	class BloomFilter {
	    WTF_MAKE_FAST_ALLOCATED;
	public:
	    // Number of bits in the table. The shift is performed on size_t rather than int so it
	    // stays well-defined for every keyBits accepted by the static_asserts below.
	    static constexpr size_t tableSize = static_cast<size_t>(1) << keyBits;

	    // Creates a filter with every bit cleared.
	    BloomFilter();

	    // Sets the two bits selected by hash and by hash >> 16.
	    void add(unsigned hash);
	    // For example SHA1::Digest. The first 2 * sizeof(unsigned) bytes supply the two keys.
	    template <size_t hashSize> void add(const std::array<uint8_t, hashSize>&);

	    // Merges another filter of the same size into this one (bitwise OR of the two tables).
	    void add(const BloomFilter&);

	    // The filter may give false positives (claim it may contain a key it doesn't)
	    // but never false negatives (claim it doesn't contain a key it does).
	    bool mayContain(unsigned hash) const;
	    template <size_t hashSize> bool mayContain(const std::array<uint8_t, hashSize>&) const;

	    // Resets every bit to zero.
	    void clear();

	    // String conveniences: these forward the hash stored on the string's impl().
	    // impl() is dereferenced unconditionally.
	    void add(const AtomString& string) { add(string.impl()->existingHash()); }
	    void add(const String& string) { add(string.impl()->hash()); }
	    bool mayContain(const AtomString& string) const { return mayContain(string.impl()->existingHash()); }
	    bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); }

	private:
	    // Number of table bits packed into each element of m_bitArray.
	    static constexpr unsigned bitsPerPosition = 8 * sizeof(unsigned);
	    static_assert(keyBits < bitsPerPosition, "keyBits must be smaller than the bit width of unsigned");
	    static_assert(tableSize >= bitsPerPosition, "The table must span at least one word, otherwise m_bitArray would be empty");
	    // Unsigned arithmetic: (1 << keyBits) - 1 on a signed int overflows when keyBits == 31.
	    static constexpr unsigned keyMask = (1u << keyBits) - 1;
	    static unsigned arrayIndex(unsigned key) { return key / bitsPerPosition; }
	    // 1u rather than 1: the highest bit position would otherwise shift into the sign bit of an int.
	    static unsigned bitMask(unsigned key) { return 1u << (key % bitsPerPosition); }
	    template <size_t hashSize> static std::pair<unsigned, unsigned> keysFromHash(const std::array<uint8_t, hashSize>&);

	    bool isBitSet(unsigned key) const;
	    void setBit(unsigned key);

	    std::array<unsigned, tableSize / bitsPerPosition> m_bitArray;
	};

	template <unsigned keyBits>
	inline BloomFilter<keyBits>::BloomFilter()
	: m_bitArray()
	{
	}

	template <unsigned keyBits>
	inline bool BloomFilter<keyBits>::mayContain(unsigned hash) const
	{
	    // The bottom and top bits of the incoming hash serve as two independent bloom filter
	    // hash functions. This works well as long as the filter size is not much above 2^16.
	    if (!isBitSet(hash))
	        return false;
	    return isBitSet(hash >> 16);
	}

	template <unsigned keyBits>
	inline void BloomFilter<keyBits>::add(unsigned hash)
	{
	    // Two keys are taken from a single hash: the value itself and its upper half.
	    // setBit() masks each one down to the table size.
	    const unsigned lowKey = hash;
	    const unsigned highKey = hash >> 16;
	    setBit(lowKey);
	    setBit(highKey);
	}

	// Extracts the two filter keys from the leading bytes of a byte-array hash.
	// The first sizeof(unsigned) bytes form the first key and the following sizeof(unsigned)
	// bytes form the second, both read in the platform's native byte order.
	template <unsigned keyBits>
	template <size_t hashSize>
	inline std::pair<unsigned, unsigned> BloomFilter<keyBits>::keysFromHash(const std::array<uint8_t, hashSize>& hash)
	{
	    // We could use larger k value than 2 for long hashes.
	    static_assert(hashSize >= 2 * sizeof(unsigned), "Hash array too short");

	    // Copy the bytes out instead of casting the data pointer: the byte array carries no
	    // alignment guarantee for unsigned, and reading it through an unsigned lvalue breaks the
	    // aliasing rules.
	    unsigned firstKey;
	    unsigned secondKey;
	    std::memcpy(&firstKey, hash.data(), sizeof(unsigned));
	    std::memcpy(&secondKey, hash.data() + sizeof(unsigned), sizeof(unsigned));
	    return { firstKey, secondKey };
	}

	template <unsigned keyBits>
	template <size_t hashSize>
	inline bool BloomFilter<keyBits>::mayContain(const std::array<uint8_t, hashSize>& hash) const
	{
	    // Both keys derived from the hash must have their bit set for a possible match.
	    const std::pair<unsigned, unsigned> keyPair = keysFromHash(hash);
	    if (!isBitSet(keyPair.first))
	        return false;
	    return isBitSet(keyPair.second);
	}

	template <unsigned keyBits>
	template <size_t hashSize>
	inline void BloomFilter<keyBits>::add(const std::array<uint8_t, hashSize>& hash)
	{
	    // Record the hash by setting the bit for each of its two derived keys.
	    const std::pair<unsigned, unsigned> keyPair = keysFromHash(hash);
	    setBit(keyPair.first);
	    setBit(keyPair.second);
	}

	// Merges another filter into this one: afterwards this filter has every bit set that
	// was set in either table. Both tables have the same length because they share keyBits.
	template <unsigned keyBits>
	inline void BloomFilter<keyBits>::add(const BloomFilter& other)
	{
	    for (size_t i = 0; i < m_bitArray.size(); ++i)
	        m_bitArray[i] |= other.m_bitArray[i];
	}

	template <unsigned keyBits>
	bool BloomFilter<keyBits>::isBitSet(unsigned key) const
	{
	    // Only the low keyBits bits of the key select a table position.
	    unsigned maskedKey = key & keyMask;
	    ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
	    const unsigned word = m_bitArray[arrayIndex(maskedKey)];
	    return (word & bitMask(maskedKey)) != 0;
	}

	// Sets the table bit selected by the low keyBits bits of key.
	template <unsigned keyBits>
	void BloomFilter<keyBits>::setBit(unsigned key)
	{
	    unsigned maskedKey = key & keyMask;
	    ASSERT(arrayIndex(maskedKey) < m_bitArray.size());
	    m_bitArray[arrayIndex(maskedKey)] |= bitMask(maskedKey);
	}

	template <unsigned keyBits>
	inline void BloomFilter<keyBits>::clear()
	{
	    // Zero every word so the filter reports no keys.
	    for (auto& word : m_bitArray)
	        word = 0;
	}

	// Counting bloom filter with 8 bit counters. Uses 2^keyBits bytes of memory. Error rates as
	// for BloomFilter.
	// See http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters
	template <unsigned keyBits>
	class CountingBloomFilter {
	    WTF_MAKE_FAST_ALLOCATED;
	public:
	    static_assert(keyBits < 8 * sizeof(unsigned), "keyBits must be smaller than the bit width of unsigned");

	    // Number of counters in the table. The shift is performed on size_t rather than int.
	    static constexpr size_t tableSize = static_cast<size_t>(1) << keyBits;
	    // Largest value a counter can hold; a counter that reaches it is no longer changed by add() or remove().
	    static unsigned maximumCount() { return std::numeric_limits<uint8_t>::max(); }

	    // Creates a filter with every counter at zero.
	    CountingBloomFilter();

	    // Increment / decrement the two counters selected by hash and by hash >> 16.
	    void add(unsigned hash);
	    void remove(unsigned hash);

	    // The filter may give false positives (claim it may contain a key it doesn't)
	    // but never false negatives (claim it doesn't contain a key it does).
	    bool mayContain(unsigned hash) const { return firstBucket(hash) && secondBucket(hash); }

	    // The filter must be cleared before reuse even if all keys are removed.
	    // Otherwise overflowed keys will stick around.
	    void clear();

	    // String conveniences: these forward the hash stored on the string's impl().
	    // impl() is dereferenced unconditionally.
	    void add(const AtomString& string) { add(string.impl()->existingHash()); }
	    void add(const String& string) { add(string.impl()->hash()); }
	    void remove(const AtomString& string) { remove(string.impl()->existingHash()); }
	    void remove(const String& string) { remove(string.impl()->hash()); }

	    bool mayContain(const AtomString& string) const { return mayContain(string.impl()->existingHash()); }
	    bool mayContain(const String& string) const { return mayContain(string.impl()->hash()); }

	#if !ASSERT_DISABLED
	    // Slow: both walk the whole table.
	    // likelyEmpty() ignores counters stuck at maximumCount(); isClear() requires every counter to be zero.
	    bool likelyEmpty() const;
	    bool isClear() const;
	#endif

	private:
	    // Unsigned arithmetic: (1 << keyBits) - 1 on a signed int overflows when keyBits == 31.
	    static constexpr unsigned keyMask = (1u << keyBits) - 1;

	    // The low bits of the hash pick the first counter; the low bits of hash >> 16 pick the second.
	    uint8_t& firstBucket(unsigned hash) { return m_buckets[hash & keyMask]; }
	    uint8_t& secondBucket(unsigned hash) { return m_buckets[(hash >> 16) & keyMask]; }
	    const uint8_t& firstBucket(unsigned hash) const { return m_buckets[hash & keyMask]; }
	    const uint8_t& secondBucket(unsigned hash) const { return m_buckets[(hash >> 16) & keyMask]; }

	    std::array<uint8_t, tableSize> m_buckets;
	};

	template <unsigned keyBits>
	inline CountingBloomFilter<keyBits>::CountingBloomFilter()
	: m_buckets()
	{
	}

	template <unsigned keyBits>
	inline void CountingBloomFilter<keyBits>::add(unsigned hash)
	{
	    // The two entries refer to the same counter when both keys select the same bucket;
	    // it is then bumped twice, in order.
	    uint8_t* const buckets[] = { &firstBucket(hash), &secondBucket(hash) };
	    for (uint8_t* bucket : buckets) {
	        // A counter already at maximumCount() is left alone so it never wraps around.
	        if (LIKELY(*bucket < maximumCount()))
	            ++*bucket;
	    }
	}

	template <unsigned keyBits>
	inline void CountingBloomFilter<keyBits>::remove(unsigned hash)
	{
	    auto& first = firstBucket(hash);
	    auto& second = secondBucket(hash);
	    // Both counters are expected to be non-zero before either one is touched.
	    ASSERT(first);
	    ASSERT(second);
	    // In case of an overflow, the bucket sticks in the table until clear().
	    uint8_t* const buckets[] = { &first, &second };
	    for (uint8_t* bucket : buckets) {
	        if (LIKELY(*bucket < maximumCount()))
	            --*bucket;
	    }
	}

	template <unsigned keyBits>
	inline void CountingBloomFilter<keyBits>::clear()
	{
	    // Reset every counter, including any that are stuck at maximumCount().
	    for (uint8_t& bucket : m_buckets)
	        bucket = 0;
	}

	#if !ASSERT_DISABLED
	template <unsigned keyBits>
	bool CountingBloomFilter<keyBits>::likelyEmpty() const
	{
	    // Counters that are zero or stuck at maximumCount() are ignored; any other value
	    // means the filter is not empty.
	    for (size_t index = 0; index < m_buckets.size(); ++index) {
	        const uint8_t count = m_buckets[index];
	        const bool isVacant = !count;
	        const bool isSaturated = count == maximumCount();
	        if (!isVacant && !isSaturated)
	            return false;
	    }
	    return true;
	}

	template <unsigned keyBits>
	bool CountingBloomFilter<keyBits>::isClear() const
	{
	    // Advance past zero counters; the filter is clear only if the scan reaches the end.
	    size_t index = 0;
	    while (index < m_buckets.size() && !m_buckets[index])
	        ++index;
	    return index == m_buckets.size();
	}
	#endif

	}

	// Make both filter templates usable without the WTF:: prefix.
	using WTF::BloomFilter;
	using WTF::CountingBloomFilter;