Mstdlib-1.24.0
m_hashtable.h
1/* The MIT License (MIT)
2 *
3 * Copyright (c) 2015 Monetra Technologies, LLC.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a copy
6 * of this software and associated documentation files (the "Software"), to deal
7 * in the Software without restriction, including without limitation the rights
8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 * copies of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 * THE SOFTWARE.
22 */
23
24#ifndef __M_HASHTABLE_H__
25#define __M_HASHTABLE_H__
26
27/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
28
29#include <mstdlib/base/m_defs.h>
30#include <mstdlib/base/m_types.h>
31#include <mstdlib/base/m_mem.h>
32#include <mstdlib/base/m_math.h>
33#include <mstdlib/base/m_sort.h>
34#include <mstdlib/base/m_llist.h>
35
36/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
37
38__BEGIN_DECLS
39
40/* TODO:
41 * Additional functions:
42 * + M_list_t *get_keys();
43 * + M_bool is_muli_value();
44 */
45
46/*! \defgroup m_hashtable Hashtable
47 * \ingroup m_datastructures
48 * Hashtable, meant for storing key/value pairs with O(1) insertion, deletion, and search time
49 */
50
51/*! \addtogroup m_hashtable_generic Hashtable generic/base implementation
52 * \ingroup m_hashtable
53 *
54 * Hashtable, meant for storing key/value pairs.
55 *
56 * This should not be used directly. It is a base implementation that should
57 * be used by a type safe wrapper. For example: M_hash_dict.
58 *
59 * The h can uses a set of callback functions to determine behavior. Such
60 * as if it should duplicate or free values.
61 *
62 * An optional hash algorithm can be specified when creating a type safe wrapper.
63 * It is highly recommended to provide a hash algorithm. The default algorithm
64 * is an FNV1a variant using the pointer of the key.
65 *
66 * The currently provided wrappers (str and u64) use an FNV1a variant. Multiple
67 * hashing algorithms were considered but FNV1a was ultimately chosen because testing
68 * with real world data sets it was found to provide the best performance.
69 *
70 * The following hash functions were evaluated:
71 * - FNV1
72 * - FNV1a
73 * - Lookup2
74 * - Qt4's hash function
75 * - djb2
76 *
77 * Overall performance was tested. We looked at time to generate the hash,
78 * time for insert, and lookup time. The insert and lookup are
79 * specific to see how chaining due to increased collisions impacted overall
80 * performance.
81 *
82 * FNV1a had average collision performance and average hash time. Some hash
83 * functions had fewer collisions but the time it took to generate the hash
84 * far exceeded the chaining time. Others had very fast generation time but had
85 * so many collisions that the chaining time exceeded the benefit of being quick.
86 *
87 * FNV1a was found to have few enough collisions to keep any chains sort and the
88 * combined hash generation and chaining time (when chaining happened) was overall
89 * faster than the other hash's times.
90 *
91 * In order to prevent denial of service attacks by an attacker causing generation
92 * of extremely large chains FNV1a was modified. A random hash seed that is unique
93 * per hashtable object (each hashtable created using _create(...)) is used as
94 * the offset bias for the algorithm.
95 *
96 * According to draft-eastlake-fnv-09
97 * at https://tools.ietf.org/html/draft-eastlake-fnv-09#section-2.2 .
98 * "In the general case, almost any offset_basis will serve so long as it is non-zero."
99 * This information can also be found on Noll's website
100 * http://isthe.com/chongo/tech/comp/fnv/index.html in the section,
101 * "Parameters of the FNV-1/FNV-1a hash".
102 *
103 * In our variation care has been taken to ensure the bias is never 0.
104 *
105 * The random seed is created using M_rand. While M_rand is not a secure random
106 * number generator the random seed for M_rand is created from unlikely to be known
107 * data such as stack and heap memory addresses at the time the hashtable is created.
108 * It is unlikely an attacker would be able to determine the random seed to be able to
109 * get the hash seed. Nor is it likely for an attacker to be able to determine the
110 * hash seed. Testing using a random hash seed was found to alleviate chaining attacks.
111 *
112 * @{
113 */
114
115/* Hashes are M_uint32 meaning we can only have that many buckets. We can
116 * have more than that many items due to chaining where a bucket will have
117 * multiple items chained together. */
118#define M_HASHTABLE_MAX_BUCKETS (1U<<24)
119
120/* ------- Semi-Public types ------- */
121
122struct M_hashtable;
123typedef struct M_hashtable M_hashtable_t;
124
125
126/*! State tracking object for enumerating a Hashtable. This is explicitly
127 * not hidden so it doesn't require a malloc() */
129 union {
130 struct {
131 M_uint32 hash; /*!< Hash of last processed entry */
132 size_t chainid; /*!< 1-based offset within linked list of clashes of last
133 processed entry. This value is 1-based specifically
134 so when starting an enumeration, a 0,0 value would
135 indicate this */
136 } unordered;
137 struct {
138 M_llist_node_t *keynode; /*!< When ordered keys are in use this is the node of the key
139 currently being processed. */
140 } ordered;
142 size_t valueidx; /*!< When multi-value is in use which index of next value. */
143};
144typedef struct M_hashtable_enum M_hashtable_enum_t;
145
146/*! Function definition for callback to hash a key */
147typedef M_uint32 (*M_hashtable_hash_func)(const void *, M_uint32);
148
149/*! Function definition to duplicate a key or value */
150typedef void *(*M_hashtable_duplicate_func)(const void *);
151
152/*! Function definition to free a key or value */
153typedef void (*M_hashtable_free_func)(void *);
154
155
156/*! Structure of callbacks that can be registered to override default
157 * behavior for h implementation.
158 *
159 * This allows a great deal of flexibility. For instance, you may want
160 * the HashTable to take ownership of the 'value' passed to it and clean
161 * up when the entry is replaced, removed, or the h is destroyed.
162 * In this implementation, you could use NULL for 'value_duplicate' so
163 * the pointer passed in is used directly, but register an appropriate
164 * 'value_free' to auto-cleanup.
165 *
166 * Note that there are two duplicate callbacks for keys and values.
167 * There are two times a key or value can be duplicated. When it is
168 * first inserted into the h and when the h itself
169 * is duplicated.
170 *
171 * In some cases the key or value needs to be duplicated by the h
172 * wrapper instead of by the base itself. For example storing unbounded
173 * binary data as a value. To prevent extra allocations and additional
174 * wrapping the value is duplicated by the wrapper and the length is
175 * prepended. This duplicate needs the length in order to work where
176 * the other duplicate (copy of h) will get the length from
177 * the fist few bytes of the value itself.
178 */
180 M_hashtable_duplicate_func key_duplicate_insert; /*!< Callback to duplicate a key on insert. Default if
181 * NULL is pass-thru pointer */
182 M_hashtable_duplicate_func key_duplicate_copy; /*!< Callback to duplicate a key on copy. Default if
183 * NULL is pass-thru pointer */
184 M_hashtable_free_func key_free; /*!< Callback to free a key. Default if NULL
185 * is no-op */
186 M_hashtable_duplicate_func value_duplicate_insert; /*!< Callback to duplicate a value on insert. Default
187 * if NULL is pass-thru pointer */
188 M_hashtable_duplicate_func value_duplicate_copy; /*!< Callback to duplicate a value on copy. Default
189 * if NULL is pass-thru pointer */
190 M_sort_compar_t value_equality; /*!< Callback used to determine if two values are equal.
191 Primarily used for sorting muli-values stores. Default
192 is all values are equal. */
193 M_hashtable_free_func value_free; /*!< Callback to free a value. Default if
194 * NULL is a no-op */
195};
196
197/*! Flags for controlling the behavior of the hash */
198typedef enum {
199 M_HASHTABLE_NONE = 0, /*!< Case sensitive single value (new values replace). */
200 M_HASHTABLE_KEYS_ORDERED = 1 << 0, /*!< Keys should be ordered. Default is insertion order unless the
201 sorted option is specified. */
202 M_HASHTABLE_KEYS_SORTED = 1 << 1, /*!< When the keys are ordered sort them using the key_equality function. */
203 M_HASHTABLE_MULTI_VALUE = 1 << 2, /*!< Allow keys to contain multiple values.
204 Sorted in insertion order another sorting is specified. */
205 M_HASHTABLE_MULTI_SORTED = 1 << 3, /*!< Allow keys to contain multiple values sorted in ascending order */
206 M_HASHTABLE_MULTI_GETLAST = 1 << 4, /*!< When using the get function will get the last value from the list
207 when allowing multiple values. The default is to get the first value. */
208 M_HASHTABLE_STATIC_SEED = 1 << 5 /*!< Use a static seed for hash function initialization. This greatly reduces
209 the security of the hashtable and removes collision attack protections.
210 This should only be used as a performance optimization when creating
211 millions of hashtables with static data specifically for quick look up.
212 DO _NOT_ use this flag with any hashtable that could store user
213 generated data! Be very careful about duplicating a hashtable that
214 was created with this flag. All duplicates will use the static seed. */
216
217/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
218
219/*! Create a new h.
220 *
221 * The h will pre-allocate an array of buckets based on the rounded up size specified. Any hash collisions
222 * will result in those collisions being chained together via a linked list. The h will auto-expand by a
223 * power of 2 when the fill percentage specified is reached. All key entries are compared in a case-insensitive
224 * fashion, and are duplicated internally. Values are duplicated. Case is preserved for both keys and values.
225 *
226 * \param[in] size Size of the hash table. If not specified as a power of 2, will
227 * be rounded up to the nearest power of 2.
228 * \param[in] fillpct The maximum fill percentage before the hash table is expanded. If
229 * 0 is specified, the h will never expand, otherwise the
230 * value must be between 1 and 99 (recommended: 75).
231 * \param[in] key_hash The function to use for hashing a key. If not specified will use
232 * the pointer address as the key and use FNV1a.
233 * \param[in] key_equality The function to use to determine if two keys are equal. If not
234 * specified, will compare pointer addresses.
235 * \param[in] flags M_hash_strvp_flags_t flags for modifying behavior.
236 * \param[in] callbacks Register callbacks for overriding default behavior.
237 *
238 * \return Allocated h.
239 *
240 * \see M_hashtable_destroy
241 */
242M_API M_hashtable_t *M_hashtable_create(size_t size, M_uint8 fillpct,
243 M_hashtable_hash_func key_hash, M_sort_compar_t key_equality,
244 M_uint32 flags, const struct M_hashtable_callbacks *callbacks) M_MALLOC;
245
246
247/*! Destroy the h.
248 *
249 * \param[in] h Hashtable to destroy
250 * \param[in] destroy_vals M_TRUE if the values held by the h should be destroyed.
251 * This will almost always be M_TRUE. This should only be set to M_FALSE
252 * when all values held by the h are being managed externally.
253 */
254M_API void M_hashtable_destroy(M_hashtable_t *h, M_bool destroy_vals) M_FREE(1);
255
256
257/*! Insert an entry into the h.
258 *
259 * \param[in] h Hashtable being referenced.
260 * \param[in] key Key to insert.
261 * \param[in] value Value to insert into h. Value will not be duplicated.
262 * The h will take ownership of the value. Maybe NULL.
263 *
264 * \return M_TRUE on success, or M_FALSE on failure.
265 */
266M_API M_bool M_hashtable_insert(M_hashtable_t *h, const void *key, const void *value);
267
268
269/*! Remove an entry from the h.
270 *
271 * \param[in] h Hashtable being referenced.
272 * \param[in] key Key to remove from the h.
273 * \param[in] destroy_vals M_TRUE if the value held by the h should be destroyed.
274 * This will almost always be M_TRUE. This should only be set to M_FALSE
275 * when the value held by the h is being managed externally.
276 *
277 * \return M_TRUE on success, or M_FALSE if key does not exist.
278 */
279M_API M_bool M_hashtable_remove(M_hashtable_t *h, const void *key, M_bool destroy_vals);
280
281
282/*! Retrieve the value for a key from the h.
283 *
284 * \param[in] h Hashtable being referenced.
285 * \param[in] key Key for value.
286 * \param[out] value Pointer to value stored in the h. Optional, pass NULL if not needed.
287 *
288 * \return M_TRUE if value retrieved, M_FALSE if key does not exist.
289 */
290M_API M_bool M_hashtable_get(const M_hashtable_t *h, const void *key, void **value);
291
292/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
293
294/*! Wether the hashtable a multi value table.
295 *
296 * \param[in] h Hashtable being referenced.
297 *
298 * \return M_TRUE if a multi value hashtable.
299 */
301
302
303/*! Get the number of values for a given key.
304 *
305 * \param[in] h Hashtable being referenced.
306 * \param[in] key Key for value to retrieve.
307 * \param[out] len The number of values.
308 *
309 * \return M_TRUE if length is retrieved, M_FALSE if key does not exist.
310 */
311M_API M_bool M_hashtable_multi_len(const M_hashtable_t *h, const void *key, size_t *len);
312
313
314/*! Retrieve the value for a key from the given index when supporting muli-values.
315 *
316 * \param[in] h Hashtable being referenced.
317 * \param[in] key Key for value to retrieve.
318 * \param[in] idx The index the value resides at.
319 * \param[out] value Pointer to value stored. Optional, pass NULL if not needed.
320 *
321 * \return M_TRUE if value retrieved, M_FALSE if key does not exist
322 */
323M_API M_bool M_hashtable_multi_get(const M_hashtable_t *h, const void *key, size_t idx, void **value);
324
325
326/*! Remove a value from the h when supporting muli-values.
327 *
328 * If all values have been removed then the key will be removed.
329 *
330 * \param[in] h Hashtable being referenced
331 * \param[in] key Key for value to retrieve.
332 * \param[in] idx The index the value resides at.
333 * \param[in] destroy_vals M_TRUE if the value held by the h should be destroyed.
334 * This will almost always be M_TRUE. This should only be set to M_FALSE
335 * when the value held by the h is being managed externally.
336 *
337 * \return M_TRUE if the value was removed, M_FALSE if key does not exist.
338 */
339M_API M_bool M_hashtable_multi_remove(M_hashtable_t *h, const void *key, size_t idx, M_bool destroy_vals);
340
341/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
342
343/*! Retrieve the current size (number of buckets/slots, not necessarily used).
344 *
345 * \param[in] h Hashtable being referenced.
346 *
347 * \return Size of the h
348 */
349M_API M_uint32 M_hashtable_size(const M_hashtable_t *h);
350
351
352/*! Retrieve the number of collisions for h entries that has occurred since creation.
353 *
354 * \param[in] h Hashtable being referenced.
355 *
356 * \return Number of collisions.
357 */
359
360
361/*! Retrieve the number of expansions/rehashes since creation.
362 *
363 * \param[in] h Hashtable being referenced.
364 *
365 * \return number of expansions/rehashes.
366 */
368
369
370/*! Retrieve the number of entries in the h.
371 *
372 * This is the number of keys stored.
373 *
374 * \param[in] h Hashtable being referenced.
375 *
376 * \return number of entries in the h.
377 */
378M_API size_t M_hashtable_num_keys(const M_hashtable_t *h);
379
380/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
381
382/*! Start an enumeration of the keys within a h.
383 *
384 * \param[in] h Hashtable being referenced.
385 * \param[out] hashenum Outputs an initialized state variable for starting an enumeration.
386 *
387 * \return Number of items in the h
388 */
389M_API size_t M_hashtable_enumerate(const M_hashtable_t *h, M_hashtable_enum_t *hashenum);
390
391
392/*! Retrieve the next item from a h enumeration.
393 *
394 * If multi-value, keys will appear multiple times as each value will be
395 * retrieved individually.
396 *
397 * \param[in] h Hashtable being referenced.
398 * \param[in,out] hashenum State variable for tracking the enumeration process.
399 * \param[out] key Value of next enumerated key. Optional, pass NULL if not needed.
400 * \param[out] value Value of next enumerated value. Optional, pass NULL if not needed.
401 *
402 * \return M_TRUE if enumeration succeeded, M_FALSE if no more keys.
403 */
404M_API M_bool M_hashtable_enumerate_next(const M_hashtable_t *h, M_hashtable_enum_t *hashenum, const void **key, const void **value);
405
406/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
407
408/*! Merge two hashtables together.
409 *
410 * The second (src) h will be destroyed automatically upon completion of this function. Any key/value
411 * pointers for the h will be directly copied over to the destination h, they will not be
412 * duplicated. Any keys which exist in 'dest' that also exist in 'src' will be overwritten by the 'src' value.
413 *
414 * If dest and src are multi-value, all values from src will be copied into dest and the values from
415 * dest will not be removed. If dest is not multi-value and src is, then only the last value in src will
416 * be present in dest. If dest is multi-value and src is not, then the value from src will be added to dest.
417 * A value_equality function in dest is very important to ensure duplicate values are not present in a given
418 * key with multiple values.
419 *
420 * \param[in,out] dest Pointer by reference to the h receiving the key/value pairs.
421 * if dest is NULL, the src address will simply be copied to dest.
422 * \param[in,out] src Pointer to the h giving up its key/value pairs.
423 */
425
426
427/*! Duplicate an existing h.
428 *
429 * Copying all keys and values. As well as other elements such as callbacks.
430 *
431 * \param[in] h Hashtable to be copied.
432 *
433 * \return Duplicated h.
434 */
436
437/*! @} */
438
439__END_DECLS
440
441#endif
union M_hashtable_enum::@0 entry
size_t valueidx
Definition: m_hashtable.h:142
M_hashtable_duplicate_func key_duplicate_copy
Definition: m_hashtable.h:182
M_hashtable_duplicate_func value_duplicate_copy
Definition: m_hashtable.h:188
M_hashtable_duplicate_func key_duplicate_insert
Definition: m_hashtable.h:180
M_hashtable_duplicate_func value_duplicate_insert
Definition: m_hashtable.h:186
M_hashtable_free_func value_free
Definition: m_hashtable.h:193
M_sort_compar_t value_equality
Definition: m_hashtable.h:190
M_hashtable_free_func key_free
Definition: m_hashtable.h:184
M_hashtable_t * M_hashtable_duplicate(const M_hashtable_t *h)
size_t M_hashtable_num_collisions(const M_hashtable_t *h)
struct M_hashtable M_hashtable_t
Definition: m_hashtable.h:123
M_bool M_hashtable_multi_get(const M_hashtable_t *h, const void *key, size_t idx, void **value)
void(* M_hashtable_free_func)(void *)
Definition: m_hashtable.h:153
size_t M_hashtable_num_keys(const M_hashtable_t *h)
M_bool M_hashtable_multi_len(const M_hashtable_t *h, const void *key, size_t *len)
void *(* M_hashtable_duplicate_func)(const void *)
Definition: m_hashtable.h:150
M_bool M_hashtable_enumerate_next(const M_hashtable_t *h, M_hashtable_enum_t *hashenum, const void **key, const void **value)
M_bool M_hashtable_multi_remove(M_hashtable_t *h, const void *key, size_t idx, M_bool destroy_vals)
M_bool M_hashtable_remove(M_hashtable_t *h, const void *key, M_bool destroy_vals)
M_uint32(* M_hashtable_hash_func)(const void *, M_uint32)
Definition: m_hashtable.h:147
size_t M_hashtable_num_expansions(const M_hashtable_t *h)
M_hashtable_t * M_hashtable_create(size_t size, M_uint8 fillpct, M_hashtable_hash_func key_hash, M_sort_compar_t key_equality, M_uint32 flags, const struct M_hashtable_callbacks *callbacks) M_MALLOC
M_uint32 M_hashtable_size(const M_hashtable_t *h)
M_bool M_hashtable_insert(M_hashtable_t *h, const void *key, const void *value)
M_hashtable_flags_t
Definition: m_hashtable.h:198
void M_hashtable_merge(M_hashtable_t **dest, M_hashtable_t *src)
M_bool M_hashtable_is_multi(const M_hashtable_t *h)
void M_hashtable_destroy(M_hashtable_t *h, M_bool destroy_vals) M_FREE(1)
M_bool M_hashtable_get(const M_hashtable_t *h, const void *key, void **value)
size_t M_hashtable_enumerate(const M_hashtable_t *h, M_hashtable_enum_t *hashenum)
@ M_HASHTABLE_KEYS_SORTED
Definition: m_hashtable.h:202
@ M_HASHTABLE_MULTI_VALUE
Definition: m_hashtable.h:203
@ M_HASHTABLE_NONE
Definition: m_hashtable.h:199
@ M_HASHTABLE_MULTI_GETLAST
Definition: m_hashtable.h:206
@ M_HASHTABLE_MULTI_SORTED
Definition: m_hashtable.h:205
@ M_HASHTABLE_KEYS_ORDERED
Definition: m_hashtable.h:200
@ M_HASHTABLE_STATIC_SEED
Definition: m_hashtable.h:208
Definition: m_hashtable.h:179
Definition: m_hashtable.h:128
struct M_llist_node M_llist_node_t
Definition: m_llist.h:78
int(* M_sort_compar_t)(const void *arg1, const void *arg2, void *thunk)
Definition: m_sort.h:78