sort of finish bloom filter

2014-04-20 17:10:07 +02:00 · 2014-04-20 17:10:07 +02:00 · 90e69a4c54
commit 90e69a4c54
parent a5638de011
1 changed files with 48 additions and 77 deletions
--- a/bloom.c
+++ b/bloom.c
@ -1,4 +1,5 @@
 #include <assert.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@ -29,77 +30,8 @@ uint32_t fnv32_1_str(char *string) {
  return fnv32_1((uint8_t *) string, strlen(string));
 }

-/* http://en.wikipedia.org/wiki/MurmurHash */
-uint32_t Murmur3_32(uint8_t * key, size_t len, uint32_t seed) {
-  // Note: In this version, all integer arithmetic is performed with unsigned 32 bit integers.
-  //       In the case of overflow, the result is constrained by the application of modulo 2^{32} arithmetic.
-  const uint32_t c1 = 0xcc9e2d51UL;
-  uint32_t c2 = 0x1b873593UL;
-  uint32_t r1 = 15;
-  uint32_t r2 = 13;
-  uint32_t m = 5;
-  uint32_t n = 0xe6546b64UL;
-
-  uint32_t hash = seed;
-
-  size_t i = 0;
-
-  /* For each four-byte chunk of key */
-  for (i = 0; i < len / 4; i++) {
-    /* FIXME endianness */
-    uint32_t k = (key[i + 0] << 0) | (key[i + 1] << 8) | (key[i + 2] << 16) | (key[i + 3] << 24);
-
-    k = k * c1;
-    k = (k << r1) | (k >> (32 - r1));
-    k = k * c2;
-
-    hash = hash ^ k;
-    hash = (hash << r2) | (hash >> (32 - r2));
-    hash = hash * m + n;
-  }
-
-  /* With any remaining bytes: */
-  if (len > i * 4) {
-    size_t remaininglen = len - i * 4;
-    uint32_t remainingbytes = 0;
-    for (size_t j = 0; j < remaininglen; j++) {
-      remainingbytes = remainingbytes << 8;
-      remainingbytes |= key[len - 1 - j];
-    }
-
-    // remainingbytes \gets SwapEndianOrderOf(remainingbytesInKey)
-    // Note: Endian swapping is only necessary on big-endian machines.
-    //       The purpose is to place the meaningful digits towards the low end of the value,
-    //       so that these digits have the greatest potential to affect the low range digits
-    //       in the subsequent multiplication.  Consider that locating the meaningful digits
-    //       in the high range would produce a greater effect upon the high digits of the
-    //       multiplication, and notably, that such high digits are likely to be discarded
-    //       by the modulo arithmetic under overflow.  We don't want that.
-    remainingbytes = remainingbytes * c1;
-    remainingbytes =
-      (remainingbytes << r1) | (remainingbytes >> (32 - r1));
-    remainingbytes = remainingbytes * c2;
-
-    hash = hash ^ remainingbytes;
-  }
-
-  hash = hash ^ len;
-
-  hash = hash ^ (hash >> 16);
-  hash = hash * 0x85ebca6b;
-  hash = hash ^ (hash >> 13);
-  hash = hash * 0xc2b2ae35;
-  hash = hash ^ (hash >> 16);
-
-  return hash;
-}
-
-uint32_t Murmur3_32_str(char *string) {
-  return Murmur3_32((uint8_t *) string, strlen(string), 0);
-}
-
-int main(void) {
 /* Test FNV32_1 */
+void test_fnv32_1() {
  assert(fnv32_1_str("03SB[") == 0x00000000UL);
  assert(fnv32_1_str("") == 0x811c9dc5UL);
  assert(fnv32_1_str("a") == 0x050c5d7eUL);
@ -113,10 +45,49 @@ int main(void) {
  assert(fnv32_1_str("foob") == 0xb4b1178bUL);
  assert(fnv32_1_str("fooba") == 0xfdc80fb0UL);
  assert(fnv32_1_str("foobar") == 0x31f0b262UL);
-
-  /* Test MurmurHash3 for x86, 32-bit */
-  /* FIXME */
-  printf("%x\n", Murmur3_32_str("The quick brown fox jumps over the lazy dog"));
-
-  /* FIXME: bloom filter... */
+}
+
+
+/* Bloom filter data structure.
+ *
+ * The filter is a fixed-size bit set packed into an array of
+ * bloom_filter_type words. The number of words is computed at compile time
+ * from the number of bits and the width of one word, so changing BLOOM_BITS
+ * or bloom_filter_type resizes the storage without further edits.
+ */
+typedef uint64_t bloom_filter_type;
+enum {
+  /* Total number of bits in the filter. */
+  BLOOM_BITS = 64,
+  /* Bits held by one array element (assumes 8-bit bytes). */
+  BLOOM_WORD_BITS = 8 * sizeof(bloom_filter_type),
+  /* Elements needed to hold BLOOM_BITS bits, rounded up. */
+  BLOOM_WORDS = (BLOOM_BITS + BLOOM_WORD_BITS - 1) / BLOOM_WORD_BITS
+};
+const size_t bloom_bits = BLOOM_BITS;
+bloom_filter_type bloom_filter[BLOOM_WORDS] = { 0 };
+
+/* Pick the filter bit that represents a string.
+ *
+ * Hashes the string with fnv32_1_str() and reduces the hash modulo
+ * bloom_bits, giving a bit position in the range [0, bloom_bits).
+ */
+size_t bloom_bit(char *string) {
+  const uint32_t hash = fnv32_1_str(string);
+
+  /* Note: a real bloom filter would use multiple bits and multiple hash fns. */
+  return hash % bloom_bits;
+}
+
+/* Add a value to the bloom filter.
+ *
+ * Sets the bit selected by bloom_bit(string) in the global bloom_filter
+ * array. Adding the same string again leaves the filter unchanged.
+ */
+void bloom_add(char *string) {
+  /* Bits per array element. sizeof alone counts bytes; dividing a bit
+   * position by it would pick an element past the end of the array for
+   * every position >= 8. */
+  const size_t word_bits = 8 * sizeof(bloom_filter_type);
+  const size_t bit = bloom_bit(string);   /* hash the string only once */
+  const size_t i = bit / word_bits;       /* element holding the bit */
+  const size_t j = bit % word_bits;       /* position within that element */
+
+  /* Build the mask at filter width: a plain int 1 cannot be shifted by
+   * 32 or more without undefined behavior. */
+  bloom_filter[i] |= ((bloom_filter_type) 1 << j);
+}
+
+/* Check if a value might have been seen before.
+ *
+ * Returns true when the bit selected by bloom_bit(string) is set in the
+ * global bloom_filter, false otherwise. A true result may be a false
+ * positive: any previously added string that selects the same bit sets it.
+ */
+bool bloom_check(char *string) {
+  /* Bits per array element. sizeof alone counts bytes; dividing a bit
+   * position by it would pick an element past the end of the array for
+   * every position >= 8. */
+  const size_t word_bits = 8 * sizeof(bloom_filter_type);
+  const size_t bit = bloom_bit(string);   /* hash the string only once */
+  const size_t i = bit / word_bits;       /* element holding the bit */
+  const size_t j = bit % word_bits;       /* position within that element */
+
+  /* Build the mask at filter width: a plain int 1 cannot be shifted by
+   * 32 or more without undefined behavior. */
+  return (bloom_filter[i] & ((bloom_filter_type) 1 << j)) != 0;
+}
+
+/* Test filter.
+ *
+ * Checks that "foo" and "bar" are each reported absent before being added
+ * and present afterwards. The checks run against the global bloom_filter,
+ * so the "absent" assertions only hold if neither string's bit was set
+ * before this function runs.
+ */
+void test_bloom_filter() {
+  assert(bloom_check("foo") == false);
+  bloom_add("foo");
+  assert(bloom_check("foo") == true);
+
+  assert(bloom_check("bar") == false); /* assuming foo's hash values != bar's, i.e. the two strings select different bits */
+  bloom_add("bar");
+  assert(bloom_check("bar") == true);
+}
+
+
+/* Program entry point: runs the FNV-1 hash tests, then the bloom filter
+ * tests. Failures are reported through assert() inside the test functions. */
+int main(void) {
+  test_fnv32_1();
+  test_bloom_filter();
+  return 0;
 }