aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/bartworks/util/MurmurHash3.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/bartworks/util/MurmurHash3.java')
-rw-r--r--src/main/java/bartworks/util/MurmurHash3.java306
1 files changed, 306 insertions, 0 deletions
diff --git a/src/main/java/bartworks/util/MurmurHash3.java b/src/main/java/bartworks/util/MurmurHash3.java
new file mode 100644
index 0000000000..825c83cca2
--- /dev/null
+++ b/src/main/java/bartworks/util/MurmurHash3.java
@@ -0,0 +1,306 @@
+/*
+ * The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain. This java port was authored
+ * by Yonik Seeley and also placed into the public domain. The author hereby disclaims copyright to this source code.
+ * <p> This produces exactly the same hash values as the final C++ version of MurmurHash3 and is thus suitable for
+ * producing the same hash values across platforms. <p> The 32 bit x86 version of this hash should be the fastest
+ * variant for relatively short keys like ids. murmurhash3_x64_128 is a good choice for longer strings or if you need
+ * more than 32 bits of hash. <p> Note - The x86 and x64 versions do _not_ produce the same results, as the algorithms
+ * are optimized for their respective platforms. <p> See http://github.com/yonik/java_util for future updates to this
+ * file. Special Thanks to Austin Appleby and Yonik Seeley for placing this in the public domain and therefore allowing
+ * me to use it!
+ */
+package bartworks.util;
+
+public final class MurmurHash3 {
+
+ public static int fmix32(int h) {
+ h ^= h >>> 16;
+ h *= 0x85ebca6b;
+ h ^= h >>> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >>> 16;
+ return h;
+ }
+
+ public static long fmix64(long k) {
+ k ^= k >>> 33;
+ k *= 0xff51afd7ed558ccdL;
+ k ^= k >>> 33;
+ k *= 0xc4ceb9fe1a85ec53L;
+ k ^= k >>> 33;
+ return k;
+ }
+
+ /**
+ * Gets a long from a byte buffer in little endian byte order.
+ */
+ public static long getLongLittleEndian(byte[] buf, int offset) {
+ return (long) buf[offset + 7] << 56 // no mask needed
+ | (buf[offset + 6] & 0xffL) << 48
+ | (buf[offset + 5] & 0xffL) << 40
+ | (buf[offset + 4] & 0xffL) << 32
+ | (buf[offset + 3] & 0xffL) << 24
+ | (buf[offset + 2] & 0xffL) << 16
+ | (buf[offset + 1] & 0xffL) << 8
+ | buf[offset] & 0xffL; // no shift needed
+ }
+
+ /**
+ * Returns the MurmurHash3_x86_32 hash.
+ */
+ public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {
+
+ final int c1 = 0xcc9e2d51;
+ final int c2 = 0x1b873593;
+
+ int h1 = seed;
+ int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block
+
+ for (int i = offset; i < roundedEnd; i += 4) {
+ // little endian load order
+ int k1 = data[i] & 0xff | (data[i + 1] & 0xff) << 8 | (data[i + 2] & 0xff) << 16 | data[i + 3] << 24;
+ k1 *= c1;
+ k1 = k1 << 15 | k1 >>> 17; // ROTL32(k1,15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = h1 << 13 | h1 >>> 19; // ROTL32(h1,13);
+ h1 = h1 * 5 + 0xe6546b64;
+ }
+
+ // tail
+ int k1 = 0;
+
+ switch (len & 0x03) {
+ case 3:
+ k1 = (data[roundedEnd + 2] & 0xff) << 16;
+ // fallthrough
+ case 2:
+ k1 |= (data[roundedEnd + 1] & 0xff) << 8;
+ // fallthrough
+ case 1:
+ k1 |= data[roundedEnd] & 0xff;
+ k1 *= c1;
+ k1 = k1 << 15 | k1 >>> 17; // ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
+ }
+
+ // finalization
+ h1 ^= len;
+
+ // fmix(h1);
+ h1 ^= h1 >>> 16;
+ h1 *= 0x85ebca6b;
+ h1 ^= h1 >>> 13;
+ h1 *= 0xc2b2ae35;
+ h1 ^= h1 >>> 16;
+
+ return h1;
+ }
+
+ /**
+ * Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding the string to a
+ * temporary buffer. This is more than 2x faster than hashing the result of String.getBytes().
+ */
+ public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) {
+
+ final int c1 = 0xcc9e2d51;
+ final int c2 = 0x1b873593;
+
+ int h1 = seed;
+
+ int pos = offset;
+ int end = offset + len;
+ int k1 = 0;
+ int k2 = 0;
+ int shift = 0;
+ int bits = 0;
+ int nBytes = 0; // length in UTF8 bytes
+
+ while (pos < end) {
+ int code = data.charAt(pos);
+ pos++;
+ if (code < 0x80) {
+ k2 = code;
+ bits = 8;
+
+ /***
+ * // optimized ascii implementation (currently slower!!! code size?) if (shift == 24) { k1 = k1 | (code
+ * << 24);
+ *
+ * k1 *= c1; k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); k1 *= c2;
+ *
+ * h1 ^= k1; h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); h1 = h1*5+0xe6546b64;
+ *
+ * shift = 0; nBytes += 4; k1 = 0; } else { k1 |= code << shift; shift += 8; } continue;
+ ***/
+ } else if (code < 0x800) {
+ k2 = 0xC0 | code >> 6 | (0x80 | code & 0x3F) << 8;
+ bits = 16;
+ } else if (code < 0xD800 || code > 0xDFFF || pos >= end) {
+ // we check for pos>=end to encode an unpaired surrogate as 3 bytes.
+ k2 = 0xE0 | code >> 12 | (0x80 | code >> 6 & 0x3F) << 8 | (0x80 | code & 0x3F) << 16;
+ bits = 24;
+ } else {
+ // surrogate pair
+ // int utf32 = pos < end ? (int) data.charAt(pos++) : 0;
+ int utf32 = data.charAt(pos++);
+ utf32 = (code - 0xD7C0 << 10) + (utf32 & 0x3FF);
+ k2 = 0xff & (0xF0 | utf32 >> 18) | (0x80 | utf32 >> 12 & 0x3F) << 8
+ | (0x80 | utf32 >> 6 & 0x3F) << 16
+ | (0x80 | utf32 & 0x3F) << 24;
+ bits = 32;
+ }
+
+ k1 |= k2 << shift;
+
+ shift += bits;
+ if (shift >= 32) {
+ // mix after we have a complete word
+
+ k1 *= c1;
+ k1 = k1 << 15 | k1 >>> 17; // ROTL32(k1,15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = h1 << 13 | h1 >>> 19; // ROTL32(h1,13);
+ h1 = h1 * 5 + 0xe6546b64;
+
+ shift -= 32;
+ // unfortunately, java won't let you shift 32 bits off, so we need to check for 0
+ if (shift != 0) {
+ k1 = k2 >>> bits - shift; // bits used == bits - newshift
+ } else {
+ k1 = 0;
+ }
+ nBytes += 4;
+ }
+ } // inner
+
+ // handle tail
+ if (shift > 0) {
+ nBytes += shift >> 3;
+ k1 *= c1;
+ k1 = k1 << 15 | k1 >>> 17; // ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
+ }
+
+ // finalization
+ h1 ^= nBytes;
+
+ // fmix(h1);
+ h1 ^= h1 >>> 16;
+ h1 *= 0x85ebca6b;
+ h1 ^= h1 >>> 13;
+ h1 *= 0xc2b2ae35;
+ h1 ^= h1 >>> 16;
+
+ return h1;
+ }
+
+ /**
+ * Returns the MurmurHash3_x64_128 hash, placing the result in "out".
+ */
+ public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) {
+ // The original algorithm does have a 32 bit unsigned seed.
+ // We have to mask to match the behavior of the unsigned types and prevent sign extension.
+ long h1 = seed & 0x00000000FFFFFFFFL;
+ long h2 = seed & 0x00000000FFFFFFFFL;
+
+ final long c1 = 0x87c37b91114253d5L;
+ final long c2 = 0x4cf5ad432745937fL;
+
+ int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block
+ for (int i = offset; i < roundedEnd; i += 16) {
+ long k1 = getLongLittleEndian(key, i);
+ long k2 = getLongLittleEndian(key, i + 8);
+ k1 *= c1;
+ k1 = Long.rotateLeft(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+ h1 = Long.rotateLeft(h1, 27);
+ h1 += h2;
+ h1 = h1 * 5 + 0x52dce729;
+ k2 *= c2;
+ k2 = Long.rotateLeft(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+ h2 = Long.rotateLeft(h2, 31);
+ h2 += h1;
+ h2 = h2 * 5 + 0x38495ab5;
+ }
+
+ long k1 = 0;
+ long k2 = 0;
+
+ switch (len & 15) {
+ case 15:
+ k2 = (key[roundedEnd + 14] & 0xffL) << 48;
+ case 14:
+ k2 |= (key[roundedEnd + 13] & 0xffL) << 40;
+ case 13:
+ k2 |= (key[roundedEnd + 12] & 0xffL) << 32;
+ case 12:
+ k2 |= (key[roundedEnd + 11] & 0xffL) << 24;
+ case 11:
+ k2 |= (key[roundedEnd + 10] & 0xffL) << 16;
+ case 10:
+ k2 |= (key[roundedEnd + 9] & 0xffL) << 8;
+ case 9:
+ k2 |= key[roundedEnd + 8] & 0xffL;
+ k2 *= c2;
+ k2 = Long.rotateLeft(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+ case 8:
+ k1 = (long) key[roundedEnd + 7] << 56;
+ case 7:
+ k1 |= (key[roundedEnd + 6] & 0xffL) << 48;
+ case 6:
+ k1 |= (key[roundedEnd + 5] & 0xffL) << 40;
+ case 5:
+ k1 |= (key[roundedEnd + 4] & 0xffL) << 32;
+ case 4:
+ k1 |= (key[roundedEnd + 3] & 0xffL) << 24;
+ case 3:
+ k1 |= (key[roundedEnd + 2] & 0xffL) << 16;
+ case 2:
+ k1 |= (key[roundedEnd + 1] & 0xffL) << 8;
+ case 1:
+ k1 |= key[roundedEnd] & 0xffL;
+ k1 *= c1;
+ k1 = Long.rotateLeft(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+ }
+
+ // ----------
+ // finalization
+
+ h1 ^= len;
+ h2 ^= len;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ out.val1 = h1;
+ out.val2 = h2;
+ }
+
+ /**
+ * 128 bits of state
+ */
+ public static final class LongPair {
+
+ public long val1;
+ public long val2;
+ }
+}