From 9130e47b171c5182ffe6c14eb710fdcb73943de4 Mon Sep 17 00:00:00 2001 From: Katharina Fey Date: Sat, 13 Jul 2019 06:40:05 +0100 Subject: Adding initial support for HASH nodes This PR adds initial support for HASH data nodes in libbowl. This allows a performant key-value store lookup in a node tree. The hashing code implements the "murmur" hash, which has shown good performance over at [`libcuckoo`]. Currently there is no extended hashing strategy, which should definitely be changed. [`libcuckoo`]: https://github.com/qaul/libcuckoo (currently a collision will cause a recursive re-alloc) Some of the type-level hacks also begs the question if a PAIR data node might be warranted, even though it would break the simple design around bowl->data. --- CMakeLists.txt | 9 +++---- bowl.c | 19 ++++++++++++--- bowl.h | 19 ++++++++------- examples/list.c | 8 +++---- examples/tree.c | 10 ++++---- hash.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ hash.h | 11 +++++++++ utils.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ utils.h | 4 +++- 9 files changed, 198 insertions(+), 25 deletions(-) create mode 100644 hash.c create mode 100644 hash.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ab2f46..abe82d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,10 +3,11 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c11") set(CMAKE_BUILD_TYPE Debug) project(bowl) -add_library(bowl SHARED bowl.c +add_library(bowl SHARED array.c + bowl.c data.c - utils.c - array.c) + hash.c + utils.c) target_include_directories(bowl PUBLIC ".") @@ -15,4 +16,4 @@ add_executable(ex_tree examples/tree.c) target_link_libraries(ex_tree bowl) add_executable(ex_list examples/list.c) -target_link_libraries(ex_list bowl) \ No newline at end of file +target_link_libraries(ex_list bowl) diff --git a/bowl.c b/bowl.c index 4eeeb59..3cb36b9 100644 --- a/bowl.c +++ b/bowl.c @@ -1,5 +1,6 @@ #include "bowl.h" #include "array.h" +#include "hash.h" #include #include @@ -37,7 +38,7 @@ err_t bowl_malloc(struct bowl **ptr, bowl_t type) return OK; } -err_t bowl_insert(struct bowl *self, struct bowl *new) +err_t bowl_append(struct bowl *self, struct bowl *new) { CHECK(self, INVALID_PARAMS) CHECK(new, INVALID_PARAMS) @@ -52,7 +53,19 @@ err_t bowl_insert(struct bowl *self, struct bowl *new) return OK; } -err_t bowl_insert_key(struct bowl *self, size_t idx, struct bowl *child) +err_t bowl_insert_key(struct bowl *self, char *key, struct bowl *child) +{ + CHECK(self, INVALID_PARAMS) + CHECK(child, INVALID_PARAMS) + CHECK(key, INVALID_PARAMS) + + switch(self->type) { + case HASH: return hash_insert_key(self, key, child); + default: return INVALID_PARAMS; + } +} + +err_t bowl_insert_idx(struct bowl *self, size_t idx, struct bowl *child) { CHECK(self, INVALID_PARAMS) CHECK(child, INVALID_PARAMS) @@ -66,7 +79,7 @@ err_t bowl_insert_key(struct bowl *self, size_t idx, struct bowl *child) } } -err_t bowl_swap_key(struct bowl *self, size_t idx, struct bowl *child, struct bowl **prev) +err_t bowl_swap_idx(struct bowl *self, size_t idx, struct bowl *child, struct bowl **prev) { CHECK(self, INVALID_PARAMS) CHECK(child, INVALID_PARAMS) diff --git a/bowl.h b/bowl.h index 63149d2..ae7efe3 100644 --- a/bowl.h +++ b/bowl.h @@ -12,7 +12,7 @@ extern "C" { #define CHECK(ptr, err) if(!ptr) return err; /** Define some generic error codes first that we can propagate **/ -typedef enum err_t { +typedef enum { OK = 0, ERR = -1, NOT_IMPLEMENTED = 1, // A runtime error for missing features @@ -22,7 +22,7 @@ typedef enum err_t { INVALID_STATE = 4, // Calling an operation on an invalid state } err_t; -typedef enum data_t { +typedef enum { LITERAL, // Any string that is \0 terminated INTEGER, // Integer value REAL, // Floating point value @@ -30,7 +30,7 @@ typedef enum data_t { RAW // A platform-length pointer } data_t; -typedef enum bowl_t { +typedef enum { LEAF = 1, ARRAY, HASH, LINKED } bowl_t; @@ -67,14 +67,17 @@ struct b_ptr { /// Allocate memory for an new, empty bowl node err_t bowl_malloc(struct bowl **, bowl_t); -/// Insert one node into another -err_t bowl_insert(struct bowl *, struct bowl *); +/// Insert one node at the end of another +err_t bowl_append(struct bowl *, struct bowl *); + +/// Insert a node under a specific key (relevant for HASH nodes) +err_t bowl_insert_key(struct bowl *, char *key, struct bowl *); /// Insert a node into a specific index of another node -err_t bowl_insert_key(struct bowl *, size_t idx, struct bowl *); +err_t bowl_insert_idx(struct bowl *, size_t idx, struct bowl *); /// Insert and swap a key in place, returning the old one -err_t bowl_swap_key(struct bowl *, size_t idx, struct bowl *, struct bowl **); +err_t bowl_swap_idx(struct bowl *, size_t idx, struct bowl *, struct bowl **); /// Remove a bowl node by it's pointer reference err_t bowl_remove(struct bowl *, struct bowl *); @@ -94,4 +97,4 @@ err_t data_free(struct data *); #ifdef __cplusplus } #endif -#endif // _BOWL_H_ \ No newline at end of file +#endif // _BOWL_H_ diff --git a/examples/list.c b/examples/list.c index a09588b..84205e2 100644 --- a/examples/list.c +++ b/examples/list.c @@ -9,15 +9,15 @@ int main() struct bowl *a; data_malloc(&a, LITERAL, "Destroy capitalism"); - bowl_insert(root, a); + bowl_append(root, a); struct bowl *b; data_malloc(&b, INTEGER, 1312); - bowl_insert(root, b); + bowl_append(root, b); struct bowl *c; data_malloc(&c, LITERAL, "Alerta, Antifascista!"); - bowl_insert(root, c); + bowl_append(root, c); return bowl_free(root); -} \ No newline at end of file +} diff --git a/examples/tree.c b/examples/tree.c index e55acb9..eb1b33a 100644 --- a/examples/tree.c +++ b/examples/tree.c @@ -30,17 +30,17 @@ int main() if(e) return e; // Add the d node to c - e = bowl_insert(c, d); + e = bowl_append(c, d); if(e) e; // Add other nodes to root - e = bowl_insert(root, a); + e = bowl_append(root, a); if(e) return e; - e = bowl_insert(root, b); + e = bowl_append(root, b); if(e) return e; - e = bowl_insert(root, c); + e = bowl_append(root, c); if(e) return e; e = bowl_free(root); return e; -} \ No newline at end of file +} diff --git a/hash.c b/hash.c new file mode 100644 index 0000000..689bf18 --- /dev/null +++ b/hash.c @@ -0,0 +1,73 @@ +#include "bowl.h" +#include "array.h" +#include "hash.h" +#include "utils.h" + +#include + +typedef struct { + char *key; + struct bowl *data; +} hash_item; + +err_t hash_insert_key(struct bowl *self, char *key, struct bowl *child) +{ + CHECK(self, INVALID_STATE) + CHECK(child, INVALID_STATE) + CHECK(key, INVALID_STATE) + + // Even though we're a HASH node, we use an array under the hood + struct bowl_arr *arr = self->_pl.array; + CHECK(arr, INVALID_STATE) + + size_t idx; + err_t e = _hash(key, arr->size, &idx); + if(e) return e; + + // If the index is used, rescale and try again + if(arr->ptr[idx] != NULL) { + struct bowl *prev = self; + + // Allocate new array + e = array_malloc(self, arr->size * 2); + if(e) return e; + + for(int i = 0; i < arr->size; i++) { + if(arr->ptr[i] != NULL) { + // FIXME: This is horrible + hash_item *item = (hash_item*) arr->ptr[i]; + e = hash_insert_key(self, item->key, item->data); + if(e) return e; + } + } + + e = array_free(prev); + if(e) return e; + + e = hash_insert_key(self, key, child); + if(e) return e; + + } else { + hash_item *item = malloc(sizeof(hash_item)); + CHECK(item, MALLOC_FAILED) + + item->key = calloc(sizeof(char), REAL_STRLEN(key)); + CHECK(item->key, MALLOC_FAILED) + strcpy(item->key, key); + + item->data = child; + + arr->ptr[idx] = (struct bowl*) item; + } + + return OK; +} + +err_t hash_remove_key(struct bowl *self, char *key, struct bowl **child) +{ + CHECK(self, INVALID_STATE) + CHECK(key, INVALID_STATE) + + return OK; +} + diff --git a/hash.h b/hash.h new file mode 100644 index 0000000..a5d763d --- /dev/null +++ b/hash.h @@ -0,0 +1,11 @@ +#ifndef HASH_H_ +#define HASH_H_ + +#include "bowl.h" + +err_t hash_insert_key(struct bowl *, char *key, struct bowl *); + +err_t hash_remove_key(struct bowl *, char *key, struct bowl **); + +#endif // HASH_H_ + diff --git a/utils.c b/utils.c index 9e5a027..f08c670 100644 --- a/utils.c +++ b/utils.c @@ -1,4 +1,5 @@ #include +#include #include #include "utils.h" @@ -44,3 +45,72 @@ err_t _array_remove(void **ptr, size_t idx, size_t len, void **out) return OK; } + +err_t _hash(char *str, size_t len, size_t *out) +{ + CHECK(str, INVALID_PARAMS) + CHECK((len < 0), INVALID_PARAMS) + + // Implements the "murmur" non-cryptographic hash + +#define C1 0xcc9e2d51 +#define C2 0x1b873593 +#define N 0xe6546b64 +#define M 5 +#define C3 0x85ebca6b +#define C4 0xc2b2ae35 +#define R1 15 +#define R2 13 +#define R3 16 +#define R4 13 +#define ROTL32(v, shift) ( ((v) << (shift)) | ( (v) >> (32 - (shift))) ) + + size_t key_len = strlen(str); + int l = (int) key_len / 4; + uint32_t seed = 0xAF172FE; + int i = 0; + + uint32_t k = 0; + uint32_t h = seed; + uint8_t *d = (uint8_t *) str; + const uint32_t *chunks = (const uint32_t *)(d + l * 4); + const uint8_t *tail = (const uint8_t *)(d + l * 4); + + for (i = -l; i != 0; ++i) { + k = chunks[i]; + k *= C1; + k = ROTL32(k, R1); + k *= C2; + h ^= k; + h = ROTL32(k, R2); + h = h * M + N; + } + + k = 0; + switch(key_len & 3) { + case 3: + k ^= (tail[2] << 16); + case 2: + k ^= (tail[1] << 8); + case 1: + k ^= tail[0]; + k *= C1; + k = ROTL32(k, R1); + k *= C2; + h ^= k; + default: break; + } + + /* Finalised hash */ + h ^= key_len; + h ^= (h >> R3); + h *= C3; + h ^= (h >> R4); + h *= C4; + h ^= (h >> R3); + + /* Finalise ✨ */ + h %= len; + *out = (size_t) h; + return OK; +} diff --git a/utils.h b/utils.h index c7c2178..4fa7a90 100644 --- a/utils.h +++ b/utils.h @@ -13,7 +13,9 @@ err_t _array_search(void **, size_t, size_t *out, void *in); err_t _array_remove(void **, size_t idx, size_t len, void **out); +err_t _hash(char *str, size_t len, size_t *out); + #ifdef __cplusplus } #endif -#endif // _UTIL_H_ \ No newline at end of file +#endif // _UTIL_H_ -- cgit v1.2.3