feat(libs): add incremental version of murmurhash2 calculation

This does two passes for a given file, which is kinda slow, but I don't know how else to get the size excluding the filtered ones :< Signed-off-by: flow <flowlnlnln@gmail.com>
author: flow <flowlnlnln@gmail.com> 2022-07-23 23:14:49 -0300
committer: flow <flowlnlnln@gmail.com> 2022-07-24 17:46:53 -0300
commit: f95bcf45ad0e537a124f5aa0c7b9e97a78811289 (patch)
tree: ef501d49352200ad76671093fe1f9ad82ea8afa2 /libraries
parent: 15ec1abb6a3acb77b36f14d3ddccc97a8df8c8e1 (diff)
download: PrismLauncher-f95bcf45ad0e537a124f5aa0c7b9e97a78811289.tar.gz
PrismLauncher-f95bcf45ad0e537a124f5aa0c7b9e97a78811289.tar.bz2
PrismLauncher-f95bcf45ad0e537a124f5aa0c7b9e97a78811289.zip
2 files changed, 117 insertions, 93 deletions
diff --git a/libraries/murmur2/src/MurmurHash2.cpp b/libraries/murmur2/src/MurmurHash2.cpp
index 3e52e6d1..b625efb1 100644
--- a/libraries/murmur2/src/MurmurHash2.cpp
+++ b/libraries/murmur2/src/MurmurHash2.cpp
@@ -1,86 +1,110 @@
 //-----------------------------------------------------------------------------
 // MurmurHash2 was written by Austin Appleby, and is placed in the public
 // domain. The author hereby disclaims copyright to this source code.
-
-// Note - This code makes a few assumptions about how your machine behaves -
-
-// 1. We can read a 4-byte value from any address without crashing
-// 2. sizeof(int) == 4
-
-// And it has a few limitations -
-
-// 1. It will not work incrementally.
-// 2. It will not produce the same results on little-endian and big-endian
-//    machines.
+//
+// This was modified as to possibilitate it's usage incrementally.
+// Those modifications are also placed in the public domain, and the author of
+// such modifications hereby disclaims copyright to this source code.
 
 #include "MurmurHash2.h"
 
 //-----------------------------------------------------------------------------
-// Platform-specific functions and macros
-
-// Microsoft Visual Studio
-
-#if defined(_MSC_VER)
-
-#define BIG_CONSTANT(x) (x)
 
-// Other compilers
+// 'm' and 'r' are mixing constants generated offline.
+// They're not really 'magic', they just happen to work well.
+const uint32_t m = 0x5bd1e995;
+const int r = 24;
 
-#else	// defined(_MSC_VER)
-
-#define BIG_CONSTANT(x) (x##LLU)
-
-#endif // !defined(_MSC_VER)
-
-//-----------------------------------------------------------------------------
-
-uint64_t MurmurHash2 ( const void* key, int len, uint32_t seed )
+uint32_t MurmurHash2(std::ifstream&& file_stream, std::size_t buffer_size, std::function<bool(char)> filter_out)
 {
-  // 'm' and 'r' are mixing constants generated offline.
-  // They're not really 'magic', they just happen to work well.
-
-  const uint32_t m = 0x5bd1e995;
-  const int r = 24;
-
-  // Initialize the hash to a 'random' value
-
-  uint32_t h = seed ^ len;
-
-  // Mix 4 bytes at a time into the hash
-  const auto* data = (const unsigned char*) key;
-  while(len >= 4)
-  {
-    uint32_t k = *(uint32_t*)data;
-
-    k *= m;
-    k ^= k >> r;
-    k *= m;
-
-    h *= m;
-    h ^= k;
-
-    data += 4*sizeof(char);
-    len -= 4;
-  }
-
-  // Handle the last few bytes of the input array
-
-  switch(len)
-  {
-  case 3: h ^= data[2] << 16;
-  case 2: h ^= data[1] << 8;
-  case 1: h ^= data[0];
-      h *= m;
-  };
-
-  // Do a few final mixes of the hash to ensure the last few
-  // bytes are well-incorporated.
-
-  h ^= h >> 13;
-  h *= m;
-  h ^= h >> 15;
-
-  return h;
-} 
+    auto* buffer = new char[buffer_size];
+    char data[4];
+
+    int read = 0;
+    uint32_t size = 0;
+
+    // We need the size without the filtered out characters before actually calculating the hash,
+    // to setup the initial value for the hash.
+    do {
+        file_stream.read(buffer, buffer_size);
+        read = file_stream.gcount();
+        for (int i = 0; i < read; i++) {
+            if (!filter_out(buffer[i]))
+                size += 1;
+        }
+    } while (!file_stream.eof());
+
+    file_stream.clear();
+    file_stream.seekg(0, file_stream.beg);
+
+    int index = 0;
+
+    // This forces a seed of 1.
+    IncrementalHashInfo info{ (uint32_t)1 ^ size, (uint32_t)size };
+    do {
+        file_stream.read(buffer, buffer_size);
+        read = file_stream.gcount();
+        for (int i = 0; i < read; i++) {
+            char c = buffer[i];
+
+            if (filter_out(c))
+                continue;
+
+            data[index] = c;
+            index = (index + 1) % 4;
+
+            // Mix 4 bytes at a time into the hash
+            if (index == 0)
+                FourBytes_MurmurHash2((unsigned char*)&data, info);
+        }
+    } while (!file_stream.eof());
+
+    // Do one last bit shuffle in the hash
+    FourBytes_MurmurHash2((unsigned char*)&data, info);
+
+    delete[] buffer;
+
+    file_stream.close();
+    return info.h;
+}
+
+void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev)
+{
+    if (prev.len >= 4) {
+        // Not the final mix
+        uint32_t k = *(uint32_t*)data;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        prev.h *= m;
+        prev.h ^= k;
+
+        prev.len -= 4;
+    } else {
+        // The final mix
+
+        // Handle the last few bytes of the input array
+        switch (prev.len) {
+            case 3:
+                prev.h ^= data[2] << 16;
+            case 2:
+                prev.h ^= data[1] << 8;
+            case 1:
+                prev.h ^= data[0];
+                prev.h *= m;
+        };
+
+        // Do a few final mixes of the hash to ensure the last few
+        // bytes are well-incorporated.
+
+        prev.h ^= prev.h >> 13;
+        prev.h *= m;
+        prev.h ^= prev.h >> 15;
+
+        prev.len = 0;
+    }
+}
 
 //-----------------------------------------------------------------------------
diff --git a/libraries/murmur2/src/MurmurHash2.h b/libraries/murmur2/src/MurmurHash2.h
index c7b83bca..acea27ea 100644
--- a/libraries/murmur2/src/MurmurHash2.h
+++ b/libraries/murmur2/src/MurmurHash2.h
@@ -1,30 +1,30 @@
 //-----------------------------------------------------------------------------
-// MurmurHash2 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
+// The original MurmurHash2 was written by Austin Appleby, and is placed in the
+// public domain. The author hereby disclaims copyright to this source code.
+//
+// This was modified as to possibilitate it's usage incrementally.
+// Those modifications are also placed in the public domain, and the author of
+// such modifications hereby disclaims copyright to this source code.
 
 #pragma once
 
-//-----------------------------------------------------------------------------
-// Platform-specific functions and macros
-
-// Microsoft Visual Studio
-
-#if defined(_MSC_VER) && (_MSC_VER < 1600)
-
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned __int64 uint64_t;
+#include <cstdint>
+#include <fstream>
 
-// Other compilers
+#include <functional>
 
-#else	// defined(_MSC_VER)
-
-#include <stdint.h>
+//-----------------------------------------------------------------------------
 
-#endif // !defined(_MSC_VER)
+uint32_t MurmurHash2(
+    std::ifstream&& file_stream,
+    std::size_t buffer_size = 4096,
+    std::function<bool(char)> filter_out = [](char) { return true; });
 
-//-----------------------------------------------------------------------------
+struct IncrementalHashInfo {
+    uint32_t h;
+    uint32_t len;
+};
 
-uint64_t MurmurHash2        ( const void* key, int len, uint32_t seed = 1 );
+void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev);
 
 //-----------------------------------------------------------------------------
author	flow <flowlnlnln@gmail.com>	2022-07-23 23:14:49 -0300
committer	flow <flowlnlnln@gmail.com>	2022-07-24 17:46:53 -0300
commit	f95bcf45ad0e537a124f5aa0c7b9e97a78811289 (patch)
tree	ef501d49352200ad76671093fe1f9ad82ea8afa2 /libraries
parent	15ec1abb6a3acb77b36f14d3ddccc97a8df8c8e1 (diff)
download	PrismLauncher-f95bcf45ad0e537a124f5aa0c7b9e97a78811289.tar.gz PrismLauncher-f95bcf45ad0e537a124f5aa0c7b9e97a78811289.tar.bz2 PrismLauncher-f95bcf45ad0e537a124f5aa0c7b9e97a78811289.zip