/*
 * MurmurHash3 (x86, 32-bit) helpers, taken from the patch
 * "add an untested MurmurHash3 implementation" for bloom.c
 * (commit 882ccc6dabffb6ac15727568e6f6c0799738fdcf, neingeist,
 * Sat, 12 Apr 2014 15:44:37 +0200).
 *
 * The same patch also appended a smoke call to main() that prints
 * Murmur3_32_str("The quick brown fox jumps over the lazy dog") with "%x".
 * main() and the FNV-1 code are only partially visible in the patch context
 * and are therefore not reproduced here.
 *
 * Algorithm reference: http://en.wikipedia.org/wiki/MurmurHash
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Multiplicative constants of the per-block scramble. */
static const uint32_t murmur3_c1 = 0xcc9e2d51UL;
static const uint32_t murmur3_c2 = 0x1b873593UL;

/* Rotate a 32-bit value left by r bits; r must be in 1..31. */
static uint32_t murmur3_rotl32(uint32_t x, unsigned r) {
  return (x << r) | (x >> (32 - r));
}

/* Scramble one 32-bit input word: multiply, rotate by 15, multiply. */
static uint32_t murmur3_scramble(uint32_t k) {
  k *= murmur3_c1;
  k = murmur3_rotl32(k, 15);
  k *= murmur3_c2;
  return k;
}

/* Final avalanche: xor-shift / multiply rounds over the accumulated hash. */
static uint32_t murmur3_fmix32(uint32_t h) {
  h ^= h >> 16;
  h *= 0x85ebca6bUL;
  h ^= h >> 13;
  h *= 0xc2b2ae35UL;
  h ^= h >> 16;
  return h;
}

/**
 * Compute the 32-bit MurmurHash3 of a byte buffer.
 *
 * All arithmetic is done on uint32_t, i.e. modulo 2^32.
 *
 * @param key  Buffer to hash; must point to at least len readable bytes
 *             (it is not dereferenced when len is 0).
 * @param len  Number of bytes in key.
 * @param seed Initial hash state.
 * @return The 32-bit hash value.
 */
uint32_t Murmur3_32(uint8_t * key, size_t len, uint32_t seed) {
  const size_t nblocks = len / 4;
  const size_t tail_len = len - nblocks * 4;
  uint32_t hash = seed;

  /* For each complete four-byte block of key. */
  for (size_t i = 0; i < nblocks; i++) {
    /* Block i starts at byte offset 4*i, not i. */
    const uint8_t *block = key + i * 4;

    /*
     * Assemble the word byte by byte in little-endian order. This does not
     * depend on host endianness or alignment. Each byte is widened to
     * uint32_t first so that "<< 24" never shifts into the sign bit of a
     * promoted int.
     */
    uint32_t k = ((uint32_t) block[0] << 0)
               | ((uint32_t) block[1] << 8)
               | ((uint32_t) block[2] << 16)
               | ((uint32_t) block[3] << 24);

    hash ^= murmur3_scramble(k);
    hash = murmur3_rotl32(hash, 13);
    hash = hash * 5 + 0xe6546b64UL;
  }

  /* With any remaining 1..3 bytes: */
  if (tail_len > 0) {
    /*
     * Walk the tail from its last byte to its first so the first tail byte
     * ends up in the lowest 8 bits. The meaningful bits then sit at the low
     * end of the word, where they influence the result of the following
     * multiplication the most.
     */
    uint32_t k = 0;
    for (size_t j = 0; j < tail_len; j++) {
      k = (k << 8) | key[len - 1 - j];
    }

    hash ^= murmur3_scramble(k);
  }

  /* Only the low 32 bits of len take part, as in the original expression. */
  hash ^= (uint32_t) len;

  return murmur3_fmix32(hash);
}

/**
 * Hash a NUL-terminated string with Murmur3_32 and seed 0.
 *
 * @param string NUL-terminated string; the terminator is not hashed.
 * @return The 32-bit hash value.
 */
uint32_t Murmur3_32_str(char *string) {
  return Murmur3_32((uint8_t *) string, strlen(string), 0);
}