Lookups on known set of integer keys

前端 未结 3 1958
名媛妹妹
名媛妹妹 2020-11-30 12:12

Gperf consistently underperforms a Judy array in my environment, and I\'m wondering if there is another perfect hash library out there built specifically for integer keys. I

相关标签:
3条回答
  • 2020-11-30 12:38

    Have you tried judy arrays? Might be exactly what you need.

    0 讨论(0)
  • 2020-11-30 12:40
    /*
    ** Proof of concept for constructing a {fixed-size,lookup-only} hashtable
    ** needing only (2*N* sizeof(int)) additional storage for storing N num=value pairs.
    ** The key is supposed to be an int,
    ** the 'value' is a char.
    ** Note: some people would like to include <stdint.h> and replace all the ints by {uint32_t,uint16_t,uint8_t}.
    **
    ** (c)opyright Wildplasser, 2011-11-12
    ** href = http://stackoverflow.com/questions/8059591/lookups-on-known-set-of-integer-keys
    */
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <assert.h>
    
    struct tinyhash {
        unsigned key;
        unsigned short link;
        unsigned char value;
        unsigned is_linked :1;
        };
    #define LINK_DEAD ((unsigned short)-1)
    
        /* A hashtable with N slots for N entries (100% full) */
    #define THE_SIZE 1000
    struct tinyhash table[THE_SIZE] ;
    
    void tiny_fill(void);
    void tiny_init(void);
    int tiny_find(unsigned key);
    
    void tiny_print(void);
    void tiny_count(void);
    
    static int tiny_cmp( const void *l, const void *r);
    static unsigned short * tiny_hnd(unsigned key);
    static unsigned tiny_hash(unsigned key);
    
    int main (void)
    {
    
    assert(sizeof table == 2 * THE_SIZE * sizeof(int));
    
    fprintf (stderr, "Size=%u\n", (unsigned int) sizeof table);
    
    tiny_fill();
    tiny_init();
    tiny_print();
    tiny_count();
    
    return 0;
    }
        /* Perform a table lookup.
        ** Return the "value" that belongs to "key"..
        ** If not found -1 is returned.
        */
    int tiny_find(unsigned key)
    {
    unsigned short *ptr;
    ptr = tiny_hnd(key);
    return *ptr==LINK_DEAD ? -1 : table[*ptr].value ;
    }
    
        /* Find the place where key is located, or
        ** (if not found) where it should be appendend.
        ** The returned value is a pointer to the parent's .link field.
        */
    static unsigned short * tiny_hnd(unsigned key)
    {
    unsigned hash;
    unsigned short slot, *ptr;
    
    hash = tiny_hash(key);
    slot = hash % THE_SIZE;
    for (ptr = &table[slot].link; *ptr != LINK_DEAD; ptr = &table[*ptr].link ) {
        if ( !table[*ptr].is_linked ) break;
        if ( table[*ptr].key == key) break;
        }
    return ptr;
    }
    
        /* For testing: fill hashtable with garbage */
    void tiny_fill(void)
    {
    unsigned idx;
    for (idx=0; idx < THE_SIZE; idx++ ) {
        table[idx].key = rand() + 543 * rand();
        table[idx].value = rand() ;
            table[idx].link = LINK_DEAD;
            table[idx].is_linked = 0;
        }
    }
        /* Build hashtable, that is:
        ** shuffle the entries and build linked list.
        */
    void tiny_init(void)
    {
    unsigned idx;
    
        /* Phase 0: set all to unused.
        ** The link field is temporaly abused to store the intended
        ** slotnumber.
        */
    for (idx=0; idx < THE_SIZE; idx++ ) {
            table[idx].link = tiny_hash(table[idx].key) % THE_SIZE;
            table[idx].is_linked = 0;
        }
    
        /* Phase 0a: sort on (intended) slotnumber. */
    qsort (table, THE_SIZE, sizeof table[0] , tiny_cmp);
    
        /* Phase 1: put enties at their intended slotnumber
        ** but only if the entry that lives there does not belong there
        ** (is uninitialized).
        */
    for (idx=0; idx < THE_SIZE; idx++) {
        unsigned slot;
            /* [idx] has allready been placed */
        if (table[idx].is_linked) continue;
        slot = table[idx].link;
             /* [idx] belongs here. freeze it */
        if (slot==idx) {
            table[idx].link = LINK_DEAD;
            table[idx].is_linked = 1;
            }
            /* try to swap [idx] with the item at its intended slot */
        else {
            struct tinyhash tmp;
                /* item[slot] belongs there: keep off */
            if (table[slot].is_linked) continue;
            tmp = table[idx];
            table[idx] = table[slot];
            table[slot] = tmp;
            table[slot].is_linked = 1;
            table[slot].link = LINK_DEAD;
                /* Don't bump idx: [idx] and [slot] have been swapped,
                ** we need to inspect the new item[idx] at the next cycle.
                */
            idx--; /* idx will be re-incremented in the loop; */
            }
        }
    
        /* Phase 2: link any remaining unplaced item to its
        ** parent in the LL-chain.
        */
    for (idx=0; idx < THE_SIZE; idx++ ) {
        unsigned short *parent;
        if (table[idx].is_linked) continue;
        parent = tiny_hnd(table[idx].key);
        if (*parent != LINK_DEAD) continue; /* duplicate */
        *parent = idx;
        table[idx].is_linked = 1;
        table[idx].link = LINK_DEAD;
        }
    }
        /* Compare function for qsort()
        */
    static int tiny_cmp( const void *vl, const void *vr)
    {
    struct tinyhash *l = (struct tinyhash *)vl;
    struct tinyhash *r = (struct tinyhash *)vr;
    
    #if 0
    unsigned slot_l, slot_r;
    slot_l = tiny_hash(l->key) %THE_SIZE;
    slot_r = tiny_hash(r->key) %THE_SIZE;
    if (slot_l < slot_r ) return -3;
    if (slot_l > slot_r ) return 3;
    #else
    if (l->link < r->link ) return -3;
    if (l->link > r->link ) return 3;
    #endif
    
    if (l->key < r->key) return -2;
    if (l->key > r->key) return 2;
    
    if (l < r) return -1;
    if (l > r) return 1;
    
    return 0;
    }
    
        /* Stupid hashfunction, to be replaced with something usefull..
        ** (works good for random ints) Use at your own risk.
        */
    static unsigned tiny_hash(unsigned key)
    {
    return key * 98765;
    }
    
    void tiny_print(void)
    {
    unsigned idx;
    
    for (idx=0; idx < THE_SIZE; idx++ ) {
        unsigned slot;
        int dir;
        slot = tiny_hash(table[idx].key) % THE_SIZE;
        dir = (slot==idx) ? '=' : (slot>idx) ? '<':  '>';
        fprintf(stderr, "[%4x] %c %4x: %4x %c %10u->%3u\n"
        , idx, dir, 0xffff & slot
        , 0xffff & table[idx].link
        , table[idx].is_linked ? '*' : '.'
        , table[idx].key,table[idx].value
        );
        }
    }
        /* For testing: print the hashchains, construct a histogram of chainlengths,
        ** and calculate the "total cost of retrieval".
        */
    void tiny_count(void)
    {
    unsigned idx, cnt, len, tothops, slot;
    unsigned histogram[THE_SIZE];
    
    memset(histogram, 0, sizeof histogram);
    
    cnt=tothops=0;
    for (slot =0; slot < THE_SIZE; slot++ ) {
        idx = tiny_hash(table[slot].key) % THE_SIZE;
        if (slot!=idx) continue; /* this is not the start of a chain */
        for (len=0    ; idx != LINK_DEAD; idx = table[idx].link) {
            if (!table[idx].is_linked) continue;
            if (len==0) fprintf(stderr, "[%u]:", slot);
            fprintf(stderr, " %u", table[idx].key);
            len++;
            }
        fprintf(stderr, "(=%u)\n", len);
        cnt += len;
        histogram[len] += 1;
        tothops += (((len+1) *len)/2);
        }
    
    fprintf(stderr, "Histogram of chainlengths:\n");
    for (len=0; len < THE_SIZE; len++) {
        if (!histogram[len]) continue;
        fprintf(stderr, "[%u]: %u\n", len, histogram[len]);
        }
    fprintf(stderr, "tothops=%u/%u (%f hops per node)\n"
        , tothops, cnt, (double)tothops/cnt);
    }
    

    The result: (well: some of it)

    ....
    [978]: 1794172570(=1)
    [980]: 642121828(=1)
    [983]: 2674104091(=1)
    [985]: 547527125(=1)
    [986]: 2383911594(=1)
    [988]: 4254837348(=1)
    [989]: 1860851825 1990214465 1766290905(=3)
    [990]: 3793608270 469685686(=2)
    [992]: 1189958296 872917240(=2)
    [994]: 1999781290 1501026482(=2)
    [995]: 520334159 211600319(=2)
    [997]: 177583921(=1)
    [998]: 1173163646 1013572158(=2)
    [999]: 1705614211 3460318251(=2)
    Histogram of chainlengths:
    [1]: 369
    [2]: 190
    [3]: 57
    [4]: 15
    [5]: 4
    tothops=1491/1000 (1.491000 hops per node)
    

    Note: because of the sorting while initialising the hashtable, entries are very close to the place where they are expected. This increases the locality of reference.

    0 讨论(0)
  • 2020-11-30 12:51

    Keep your keys sorted, and use an M-tree to retrieve any key.

    An M-tree has M entry per node, instead of 2 for binaries. It will help tremendously for performance. Use a cache line size as the basis of your node size, hence 64 Bytes. You can store 16 32bits values in this size.

    Since you have 1000 values, 3 levels will be enough to retrieve the right key (as opposed to 10 levels for a binary tree).

    Another idea would be to hash your keys into a small hash-table, such as 12-bits one (4K entries), and solve the potential collisions with a simple chain. You are very likely to get most of your keys in a single search.

    0 讨论(0)
提交回复
热议问题