# 练习37：哈希表

``````fruit_weights = {'Apples': 10, 'Oranges': 100, 'Grapes': 1.0}

for key, value in fruit_weights.items():
print key, "=", value
``````

``````#ifndef _lcthw_Hashmap_h
#define _lcthw_Hashmap_h

#include <stdint.h>
#include <lcthw/darray.h>

#define DEFAULT_NUMBER_OF_BUCKETS 100

typedef int (*Hashmap_compare)(void *a, void *b);
typedef uint32_t (*Hashmap_hash)(void *key);

typedef struct Hashmap {
DArray *buckets;
Hashmap_compare compare;
Hashmap_hash hash;
} Hashmap;

typedef struct HashmapNode {
void *key;
void *data;
uint32_t hash;
} HashmapNode;

typedef int (*Hashmap_traverse_cb)(HashmapNode *node);

Hashmap *Hashmap_create(Hashmap_compare compare, Hashmap_hash);
void Hashmap_destroy(Hashmap *map);

int Hashmap_set(Hashmap *map, void *key, void *data);
void *Hashmap_get(Hashmap *map, void *key);

int Hashmap_traverse(Hashmap *map, Hashmap_traverse_cb traverse_cb);

void *Hashmap_delete(Hashmap *map, void *key);

#endif
``````

`DArray *buckets`

`Hashmap_compare compare`

`Hashmap_hash`

• 第一层有100个桶，数据基于它们的哈希值储存在桶中。
• 每个桶都是一个`DArray`，其中含有`HashmapNode`，添加时只是简单地附加到末尾。

`HashMapNode`由下面三个元素组成：

`void *key`

`void *value`

`uint32_t hash`

``````#undef NDEBUG
#include <stdint.h>
#include <lcthw/hashmap.h>
#include <lcthw/dbg.h>
#include <lcthw/bstrlib.h>

static int default_compare(void *a, void *b)
{
return bstrcmp((bstring)a, (bstring)b);
}

/**
* Simple Bob Jenkins's hash algorithm taken from the
* wikipedia description.
*/
static uint32_t default_hash(void *a)
{
size_t len = blength((bstring)a);
char *key = bdata((bstring)a);
uint32_t hash = 0;
uint32_t i = 0;

for(hash = i = 0; i < len; ++i)
{
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}

hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);

return hash;
}

Hashmap *Hashmap_create(Hashmap_compare compare, Hashmap_hash hash)
{
Hashmap *map = calloc(1, sizeof(Hashmap));
check_mem(map);

map->compare = compare == NULL ? default_compare : compare;
map->hash = hash == NULL ? default_hash : hash;
map->buckets = DArray_create(sizeof(DArray *), DEFAULT_NUMBER_OF_BUCKETS);
map->buckets->end = map->buckets->max; // fake out expanding it
check_mem(map->buckets);

return map;

error:
if(map) {
Hashmap_destroy(map);
}

return NULL;
}

void Hashmap_destroy(Hashmap *map)
{
int i = 0;
int j = 0;

if(map) {
if(map->buckets) {
for(i = 0; i < DArray_count(map->buckets); i++) {
DArray *bucket = DArray_get(map->buckets, i);
if(bucket) {
for(j = 0; j < DArray_count(bucket); j++) {
free(DArray_get(bucket, j));
}
DArray_destroy(bucket);
}
}
DArray_destroy(map->buckets);
}

free(map);
}
}

static inline HashmapNode *Hashmap_node_create(int hash, void *key, void *data)
{
HashmapNode *node = calloc(1, sizeof(HashmapNode));
check_mem(node);

node->key = key;
node->data = data;
node->hash = hash;

return node;

error:
return NULL;
}

static inline DArray *Hashmap_find_bucket(Hashmap *map, void *key,
int create, uint32_t *hash_out)
{
uint32_t hash = map->hash(key);
int bucket_n = hash % DEFAULT_NUMBER_OF_BUCKETS;
check(bucket_n >= 0, "Invalid bucket found: %d", bucket_n);
*hash_out = hash; // store it for the return so the caller can use it

DArray *bucket = DArray_get(map->buckets, bucket_n);

if(!bucket && create) {
// new bucket, set it up
bucket = DArray_create(sizeof(void *), DEFAULT_NUMBER_OF_BUCKETS);
check_mem(bucket);
DArray_set(map->buckets, bucket_n, bucket);
}

return bucket;

error:
return NULL;
}

int Hashmap_set(Hashmap *map, void *key, void *data)
{
uint32_t hash = 0;
DArray *bucket = Hashmap_find_bucket(map, key, 1, &hash);
check(bucket, "Error can't create bucket.");

HashmapNode *node = Hashmap_node_create(hash, key, data);
check_mem(node);

DArray_push(bucket, node);

return 0;

error:
return -1;
}

static inline int Hashmap_get_node(Hashmap *map, uint32_t hash, DArray *bucket, void *key)
{
int i = 0;

for(i = 0; i < DArray_end(bucket); i++) {
debug("TRY: %d", i);
HashmapNode *node = DArray_get(bucket, i);
if(node->hash == hash && map->compare(node->key, key) == 0) {
return i;
}
}

return -1;
}

void *Hashmap_get(Hashmap *map, void *key)
{
uint32_t hash = 0;
DArray *bucket = Hashmap_find_bucket(map, key, 0, &hash);
if(!bucket) return NULL;

int i = Hashmap_get_node(map, hash, bucket, key);
if(i == -1) return NULL;

HashmapNode *node = DArray_get(bucket, i);
check(node != NULL, "Failed to get node from bucket when it should exist.");

return node->data;

error: // fallthrough
return NULL;
}

int Hashmap_traverse(Hashmap *map, Hashmap_traverse_cb traverse_cb)
{
int i = 0;
int j = 0;
int rc = 0;

for(i = 0; i < DArray_count(map->buckets); i++) {
DArray *bucket = DArray_get(map->buckets, i);
if(bucket) {
for(j = 0; j < DArray_count(bucket); j++) {
HashmapNode *node = DArray_get(bucket, j);
rc = traverse_cb(node);
if(rc != 0) return rc;
}
}
}

return 0;
}

void *Hashmap_delete(Hashmap *map, void *key)
{
uint32_t hash = 0;
DArray *bucket = Hashmap_find_bucket(map, key, 0, &hash);
if(!bucket) return NULL;

int i = Hashmap_get_node(map, hash, bucket, key);
if(i == -1) return NULL;

HashmapNode *node = DArray_get(bucket, i);
void *data = node->data;
free(node);

HashmapNode *ending = DArray_pop(bucket);

if(ending != node) {
// alright looks like it's not the last one, swap it
DArray_set(bucket, i, ending);
}

return data;
}
``````

• 首先调用`map->hash(key)`来获得键的哈希值。
• 之后使用`hash % DEFAULT_NUMBER_OF_BUCKETS`，这样无论哈希值有多大，都能找到匹配的桶。
• 找到桶之后，它是个`DArray`，可能还没有创建，这取决与`create`变量的内容。
• 一旦找到了正确的`DArray`桶，就会将它返回，并且`hash_out`变量用于向调用者提供所找到的哈希值。

• 设置键值对涉及到找到正确的桶，之后创建`HashmapNode`，将它添加到桶中。
• 获取键值涉及到找到正确的桶，之后找到匹配`hash``key``HashmapNode`
• 删除元素也需要找到正确的桶，找到所需的节点，之后通过与末尾的节点交换位置来删除。

## 单元测试

``````#include "minunit.h"
#include <lcthw/hashmap.h>
#include <assert.h>
#include <lcthw/bstrlib.h>

Hashmap *map = NULL;
static int traverse_called = 0;
struct tagbstring test1 = bsStatic("test data 1");
struct tagbstring test2 = bsStatic("test data 2");
struct tagbstring test3 = bsStatic("xest data 3");
struct tagbstring expect1 = bsStatic("THE VALUE 1");
struct tagbstring expect2 = bsStatic("THE VALUE 2");
struct tagbstring expect3 = bsStatic("THE VALUE 3");

static int traverse_good_cb(HashmapNode *node)
{
debug("KEY: %s", bdata((bstring)node->key));
traverse_called++;
return 0;
}

static int traverse_fail_cb(HashmapNode *node)
{
debug("KEY: %s", bdata((bstring)node->key));
traverse_called++;

if(traverse_called == 2) {
return 1;
} else {
return 0;
}
}

char *test_create()
{
map = Hashmap_create(NULL, NULL);
mu_assert(map != NULL, "Failed to create map.");

return NULL;
}

char *test_destroy()
{
Hashmap_destroy(map);

return NULL;
}

char *test_get_set()
{
int rc = Hashmap_set(map, &test1, &expect1);
mu_assert(rc == 0, "Failed to set &test1");
bstring result = Hashmap_get(map, &test1);
mu_assert(result == &expect1, "Wrong value for test1.");

rc = Hashmap_set(map, &test2, &expect2);
mu_assert(rc == 0, "Failed to set test2");
result = Hashmap_get(map, &test2);
mu_assert(result == &expect2, "Wrong value for test2.");

rc = Hashmap_set(map, &test3, &expect3);
mu_assert(rc == 0, "Failed to set test3");
result = Hashmap_get(map, &test3);
mu_assert(result == &expect3, "Wrong value for test3.");

return NULL;
}

char *test_traverse()
{
int rc = Hashmap_traverse(map, traverse_good_cb);
mu_assert(rc == 0, "Failed to traverse.");
mu_assert(traverse_called == 3, "Wrong count traverse.");

traverse_called = 0;
rc = Hashmap_traverse(map, traverse_fail_cb);
mu_assert(rc == 1, "Failed to traverse.");
mu_assert(traverse_called == 2, "Wrong count traverse for fail.");

return NULL;
}

char *test_delete()
{
bstring deleted = (bstring)Hashmap_delete(map, &test1);
mu_assert(deleted != NULL, "Got NULL on delete.");
mu_assert(deleted == &expect1, "Should get test1");
bstring result = Hashmap_get(map, &test1);
mu_assert(result == NULL, "Should delete.");

deleted = (bstring)Hashmap_delete(map, &test2);
mu_assert(deleted != NULL, "Got NULL on delete.");
mu_assert(deleted == &expect2, "Should get test2");
result = Hashmap_get(map, &test2);
mu_assert(result == NULL, "Should delete.");

deleted = (bstring)Hashmap_delete(map, &test3);
mu_assert(deleted != NULL, "Got NULL on delete.");
mu_assert(deleted == &expect3, "Should get test3");
result = Hashmap_get(map, &test3);
mu_assert(result == NULL, "Should delete.");

return NULL;
}

char *all_tests()
{
mu_suite_start();

mu_run_test(test_create);
mu_run_test(test_get_set);
mu_run_test(test_traverse);
mu_run_test(test_delete);
mu_run_test(test_destroy);

return NULL;
}

RUN_TESTS(all_tests);
``````

## 如何改进

• 你可以对每个桶进行排序，使它们有序。这会增加你的插入时间，但是减少寻找时间，因为你可以使用二分搜索来寻找每个节点。到现在为止它遍历桶中的所有节点来寻找元素。
• 你可以动态设定桶的数量，或者让调用者指定每个`Hashmap`中的该值。
• 你可以使用更好的`default_hash`函数，有许多这样的函数。
• 这个实现以及几乎所有实现都有将一些特定的键存到一个桶中的风险。这会使你的程序运行速度变慢，因为它使`Hashmap`的处理过程变成了处理单个的`DArray`。如果你对桶中的数组排序会有帮助，但是你可以仅仅使用更好的哈希函数来避免，并且对于真正的偏执狂，你可以添加一个随机的盐，让键不可预测。
• 你可以删掉不歪有任何节点的桶来节约空间，或者将空的桶当如缓存中，便于节约创建和销毁它们的开销。
• 现在为止它可以添加已存在的元素，编写一个替代的实现，使它只能够添加不存在的元素。

## 附加题

• 研究你最喜欢的编程语言的`Hashmap`实现，了解它们具有什么特性。
• 找到`Hashmap`的主要缺点，以及如何避免它们。例如，它们不做特定的修改就不能保存顺序，并且当你基于键的一部分来查找元素时，它们就不能生效。
• 编写单元测试来展示将键都填充到`Hashmap`的一个桶中所带来的缺陷，之后测试这样如何影响性能。一个实现它的好方法，就是把桶的数量减少到一个愚蠢的数值，比如1。