add an untested MurmurHash3 implementation

2014-04-12 15:44:37 +02:00 · 2014-04-12 15:44:37 +02:00 · 882ccc6dab
commit 882ccc6dab
parent db68f3d0ce
1 changed files with 76 additions and 0 deletions
--- a/bloom.c
+++ b/bloom.c
@ -1,5 +1,6 @@
 #include <assert.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

@ -28,6 +29,75 @@ uint32_t fnv32_1_str(char *string) {
  return fnv32_1((uint8_t *) string, strlen(string));
 }

/* Rotate the 32-bit value `x` left by `r` bits; requires 0 < r < 32. */
static uint32_t murmur3_rotl32(uint32_t x, unsigned r) {
  return (x << r) | (x >> (32 - r));
}

/*
 * MurmurHash3, 32-bit variant (http://en.wikipedia.org/wiki/MurmurHash).
 *
 * key:  bytes to hash; only read, never written. Not dereferenced when
 *       len is 0.
 * len:  number of bytes in key.
 * seed: initial hash state; different seeds yield different hash functions.
 *
 * Returns the 32-bit hash of key[0 .. len-1].
 *
 * All arithmetic is done on unsigned 32-bit integers, so overflow wraps
 * modulo 2^32. Blocks and the tail are assembled byte by byte in
 * little-endian order, so the result does not depend on the byte order of
 * the host.
 */
uint32_t Murmur3_32(const uint8_t *key, size_t len, uint32_t seed) {
  const uint32_t c1 = UINT32_C(0xcc9e2d51);
  const uint32_t c2 = UINT32_C(0x1b873593);
  const unsigned r1 = 15;
  const unsigned r2 = 13;
  const uint32_t m = 5;
  const uint32_t n = UINT32_C(0xe6546b64);

  const size_t nblocks = len / 4;
  const size_t taillen = len % 4;

  uint32_t hash = seed;

  /* Body: consume the key one four-byte block at a time. */
  for (size_t i = 0; i < nblocks; i++) {
    const size_t off = i * 4;

    /* Widen each byte to uint32_t before shifting: a uint8_t promotes to
     * (signed) int, and shifting a byte >= 0x80 left by 24 would overflow. */
    uint32_t k = ((uint32_t) key[off + 0] << 0)
               | ((uint32_t) key[off + 1] << 8)
               | ((uint32_t) key[off + 2] << 16)
               | ((uint32_t) key[off + 3] << 24);

    k *= c1;
    k = murmur3_rotl32(k, r1);
    k *= c2;

    hash ^= k;
    hash = murmur3_rotl32(hash, r2);
    hash = hash * m + n;
  }

  /* Tail: the 1-3 bytes left after the last full block, if any. */
  if (taillen > 0) {
    const size_t off = nblocks * 4;
    uint32_t k = 0;

    /* The first tail byte goes into the lowest bits, so the meaningful
     * digits have the greatest effect on the multiplications below. */
    for (size_t j = 0; j < taillen; j++) {
      k |= (uint32_t) key[off + j] << (8 * j);
    }

    k *= c1;
    k = murmur3_rotl32(k, r1);
    k *= c2;

    hash ^= k;
  }

  /* Finalization: mix in the length (reduced modulo 2^32), then avalanche
   * so that every input bit can affect every output bit. */
  hash ^= (uint32_t) len;

  hash ^= hash >> 16;
  hash *= UINT32_C(0x85ebca6b);
  hash ^= hash >> 13;
  hash *= UINT32_C(0xc2b2ae35);
  hash ^= hash >> 16;

  return hash;
}
+
/*
 * Convenience wrapper: hash a NUL-terminated string with Murmur3_32 using
 * seed 0. The terminating NUL is not part of the hashed data.
 */
uint32_t Murmur3_32_str(char *string) {
  const size_t length = strlen(string);
  uint8_t *bytes = (uint8_t *) string;

  return Murmur3_32(bytes, length, 0);
}
+
 int main(void) {
  /* Test FNV32_1 */
  assert(fnv32_1_str("03SB[") == 0x00000000UL);
@ -43,4 +113,10 @@ int main(void) {
  assert(fnv32_1_str("foob") == 0xb4b1178bUL);
  assert(fnv32_1_str("fooba") == 0xfdc80fb0UL);
  assert(fnv32_1_str("foobar") == 0x31f0b262UL);
+
+  /* Test MurmurHash3 for x86, 32-bit */
+  /* FIXME */
+  printf("%x\n", Murmur3_32_str("The quick brown fox jumps over the lazy dog"));
+
+  /* FIXME: bloom filter... */
 }