From 90e69a4c5497f1b1d9116cafc62d00cdbb98bec5 Mon Sep 17 00:00:00 2001 From: neingeist Date: Sun, 20 Apr 2014 17:10:07 +0200 Subject: [PATCH] sort of finish bloom filter --- bloom.c | 121 +++++++++++++++++++++----------------------------------- 1 file changed, 46 insertions(+), 75 deletions(-) diff --git a/bloom.c b/bloom.c index 85737a6..4867d04 100644 --- a/bloom.c +++ b/bloom.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -29,77 +30,8 @@ uint32_t fnv32_1_str(char *string) { return fnv32_1((uint8_t *) string, strlen(string)); } -/* http://en.wikipedia.org/wiki/MurmurHash */ -uint32_t Murmur3_32(uint8_t * key, size_t len, uint32_t seed) { - // Note: In this version, all integer arithmetic is performed with unsigned 32 bit integers. - // In the case of overflow, the result is constrained by the application of modulo 2^{32} arithmetic. - const uint32_t c1 = 0xcc9e2d51UL; - uint32_t c2 = 0x1b873593UL; - uint32_t r1 = 15; - uint32_t r2 = 13; - uint32_t m = 5; - uint32_t n = 0xe6546b64UL; - - uint32_t hash = seed; - - size_t i = 0; - - /* For each four-byte chunk of key */ - for (i = 0; i < len / 4; i++) { - /* FIXME endianness */ - uint32_t k = (key[i + 0] << 0) | (key[i + 1] << 8) | (key[i + 2] << 16) | (key[i + 3] << 24); - - k = k * c1; - k = (k << r1) | (k >> (32 - r1)); - k = k * c2; - - hash = hash ^ k; - hash = (hash << r2) | (hash >> (32 - r2)); - hash = hash * m + n; - } - - /* With any remaining bytes: */ - if (len > i * 4) { - size_t remaininglen = len - i * 4; - uint32_t remainingbytes = 0; - for (size_t j = 0; j < remaininglen; j++) { - remainingbytes = remainingbytes << 8; - remainingbytes |= key[len - 1 - j]; - } - - // remainingbytes \gets SwapEndianOrderOf(remainingbytesInKey) - // Note: Endian swapping is only necessary on big-endian machines. - // The purpose is to place the meaningful digits towards the low end of the value, - // so that these digits have the greatest potential to affect the low range digits - // in the subsequent multiplication. Consider that locating the meaningful digits - // in the high range would produce a greater effect upon the high digits of the - // multiplication, and notably, that such high digits are likely to be discarded - // by the modulo arithmetic under overflow. We don't want that. - remainingbytes = remainingbytes * c1; - remainingbytes = - (remainingbytes << r1) | (remainingbytes >> (32 - r1)); - remainingbytes = remainingbytes * c2; - - hash = hash ^ remainingbytes; - } - - hash = hash ^ len; - - hash = hash ^ (hash >> 16); - hash = hash * 0x85ebca6b; - hash = hash ^ (hash >> 13); - hash = hash * 0xc2b2ae35; - hash = hash ^ (hash >> 16); - - return hash; -} - -uint32_t Murmur3_32_str(char *string) { - return Murmur3_32((uint8_t *) string, strlen(string), 0); -} - -int main(void) { - /* Test FNV32_1 */ +/* Test FNV32_1 */ +void test_fnv32_1() { assert(fnv32_1_str("03SB[") == 0x00000000UL); assert(fnv32_1_str("") == 0x811c9dc5UL); assert(fnv32_1_str("a") == 0x050c5d7eUL); @@ -113,10 +45,49 @@ int main(void) { assert(fnv32_1_str("foob") == 0xb4b1178bUL); assert(fnv32_1_str("fooba") == 0xfdc80fb0UL); assert(fnv32_1_str("foobar") == 0x31f0b262UL); +} + - /* Test MurmurHash3 for x86, 32-bit */ - /* FIXME */ - printf("%x\n", Murmur3_32_str("The quick brown fox jumps over the lazy dog")); +/* Bloom filter data structure */ +const size_t bloom_bits = 64; +typedef uint64_t bloom_filter_type; +bloom_filter_type bloom_filter[1] = { 0 }; // XXX should calculate the size here - /* FIXME: bloom filter... */ +/* Returns the bit in the bloom filter to set/check. */ +size_t bloom_bit(char *string) { + /* Note: a real bloom filter would use multiple bits and multiple hash fns. */ + return fnv32_1_str(string) % bloom_bits; +} + +/* Add a value to the bloom filter. */ +void bloom_add(char *string) { + size_t i = bloom_bit(string) / sizeof(bloom_filter_type); + size_t j = bloom_bit(string) % sizeof(bloom_filter_type); + + bloom_filter[i] |= (1 << j); +} + +/* Check if a value might have been seen before. */ +bool bloom_check(char *string) { + size_t i = bloom_bit(string) / sizeof(bloom_filter_type); + size_t j = bloom_bit(string) % sizeof(bloom_filter_type); + + return (bloom_filter[i] & (1 << j)) != 0; +} + +/* Test filter */ +void test_bloom_filter() { + assert(bloom_check("foo") == false); + bloom_add("foo"); + assert(bloom_check("foo") == true); + + assert(bloom_check("bar") == false); /* assuming foo's hash values != bar's */ + bloom_add("bar"); + assert(bloom_check("bar") == true); +} + + +int main(void) { + test_fnv32_1(); + test_bloom_filter(); }