libraries/murmur2/src/MurmurHash2.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110

//-----------------------------------------------------------------------------
// MurmurHash2 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
//
// This was modified as to possibilitate it's usage incrementally.
// Those modifications are also placed in the public domain, and the author of
// such modifications hereby disclaims copyright to this source code.

#include "MurmurHash2.h"

//-----------------------------------------------------------------------------

// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
const uint32_t m = 0x5bd1e995;
const int r = 24;

uint32_t MurmurHash2(std::ifstream&& file_stream, std::size_t buffer_size, std::function<bool(char)> filter_out)
{
    auto* buffer = new char[buffer_size];
    char data[4];

    int read = 0;
    uint32_t size = 0;

    // We need the size without the filtered out characters before actually calculating the hash,
    // to setup the initial value for the hash.
    do {
        file_stream.read(buffer, buffer_size);
        read = file_stream.gcount();
        for (int i = 0; i < read; i++) {
            if (!filter_out(buffer[i]))
                size += 1;
        }
    } while (!file_stream.eof());

    file_stream.clear();
    file_stream.seekg(0, file_stream.beg);

    int index = 0;

    // This forces a seed of 1.
    IncrementalHashInfo info{ (uint32_t)1 ^ size, (uint32_t)size };
    do {
        file_stream.read(buffer, buffer_size);
        read = file_stream.gcount();
        for (int i = 0; i < read; i++) {
            char c = buffer[i];

            if (filter_out(c))
                continue;

            data[index] = c;
            index = (index + 1) % 4;

            // Mix 4 bytes at a time into the hash
            if (index == 0)
                FourBytes_MurmurHash2(reinterpret_cast<unsigned char*>(&data), info);
        }
    } while (!file_stream.eof());

    // Do one last bit shuffle in the hash
    FourBytes_MurmurHash2(reinterpret_cast<unsigned char*>(&data), info);

    delete[] buffer;

    file_stream.close();
    return info.h;
}

void FourBytes_MurmurHash2(const unsigned char* data, IncrementalHashInfo& prev)
{
    if (prev.len >= 4) {
        // Not the final mix
        uint32_t k = *reinterpret_cast<const uint32_t*>(data);

        k *= m;
        k ^= k >> r;
        k *= m;

        prev.h *= m;
        prev.h ^= k;

        prev.len -= 4;
    } else {
        // The final mix

        // Handle the last few bytes of the input array
        switch (prev.len) {
            case 3:
                prev.h ^= data[2] << 16;
            case 2:
                prev.h ^= data[1] << 8;
            case 1:
                prev.h ^= data[0];
                prev.h *= m;
        };

        // Do a few final mixes of the hash to ensure the last few
        // bytes are well-incorporated.

        prev.h ^= prev.h >> 13;
        prev.h *= m;
        prev.h ^= prev.h >> 15;

        prev.len = 0;
    }
}

//-----------------------------------------------------------------------------