Data structure for matching sets

前端 未结 13 1152
有刺的猬
有刺的猬 2021-02-02 00:14

I have an application that works with a number of sets. A set might be
{4, 7, 12, 18}
where the numbers in a set are unique and all less than 50.

I then have several data items:
1 {1,

13条回答
  •  余生分开走
    2021-02-02 00:37

    The indexes of the sets that match the search criterion resemble the sets themselves: instead of unique indexes less than 50, we have unique indexes less than 50000. Since you don't mind using a bit of memory, you can precompute the matching sets in a 50-element array of 50000-bit integers. Then you index into the precomputed matches and basically just do your ((set & data) == set), but on the 50000-bit numbers which represent the matching sets. Here's what I mean.

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    
    // Compile-time sizes shared by the lookup table and the search code.
    enum
    {
        // Size of the first dimension of the lookup table (one row per element value).
        max_entry = 50,

        // Number of set numbers the bitmaps can address; should be >= 64.
        max_sets = 50000,

        // 64-bit words per bitmap: one flag bit for each possible set number.
        num_boxes = (max_sets / 64) + 1
    };
    
    // Per-element bitmap over set numbers: main() sets bit (s % 64) of
    // sets_containing[e][s / 64] when set number s contains element e.
    uint64_t sets_containing[max_entry][num_boxes];
    
    // Builds a 64-bit mask with only bit x set; x must be in the range [0, 63].
    // The argument is parenthesized so that a compound expression such as
    // _(a | b) shifts by the whole expression instead of only its first operand.
    #define _(x) (uint64_t(1) << (x))
    
    // Sample input: one 64-bit mask per set, where bit k is on when the set
    // contains the number k. A set's position in the array is its set number,
    // and the final 0 entry terminates the list.
    uint64_t sets[] =
    {
        // set 0
        _(1) | _(2) | _(4) | _(7) | _(8)
            | _(12) | _(18) | _(23) | _(29),

        // set 1
        _(3) | _(4) | _(6) | _(7)
            | _(15) | _(23) | _(34) | _(38),

        // set 2
        _(4) | _(7) | _(12) | _(18),

        // set 3
        _(1) | _(4) | _(7) | _(12) | _(13)
            | _(14) | _(15) | _(16) | _(17) | _(18),

        // set 4
        _(2) | _(4) | _(6) | _(7) | _(13) | _(15),

        // terminator
        0,
    };
    
    void big_and_equals(uint64_t lhs[num_boxes], uint64_t rhs[num_boxes])
    {
        static int comparison_counter = 0;
        for (int i = 0; i < num_boxes; ++i, ++comparison_counter)
        {
            lhs[i] &= rhs[i];
        }
        std::cout
            << "performed "
            << comparison_counter
            << " comparisons"
            << std::endl;
    }
    
    // Demo driver: builds the per-element index from `sets`, intersects the
    // index rows for a sample key, and prints every set number whose flag bit
    // survives the intersection.
    int main()
    {
        // Precompute matches
        memset(sets_containing, 0, sizeof(sets_containing));

        int set_number = 0;
        for (const uint64_t* p = &sets[0]; *p; ++p, ++set_number)
        {
            // Walk the mask from bit 0 upwards; `entry` is the current bit position.
            // Stop at max_entry so a stray high bit cannot write past the table.
            int entry = 0;
            for (uint64_t set = *p; set && entry < max_entry; set >>= 1, ++entry)
            {
                if (set & 1)
                {
                    std::cout
                        << "sets_containing["
                        << entry
                        << "]["
                        << (set_number / 64)
                        << "] gets bit "
                        << set_number % 64
                        << '\n';

                    // Record that this set contains element `entry`.
                    sets_containing[entry][set_number / 64] |= _(set_number % 64);
                }
            }
        }

        // Perform search for a key
        const int key[] = {4, 7, 12, 18};

        // Start with every bit on, then AND in the row of each key element so
        // only sets containing all of them keep their bit.
        uint64_t answer[num_boxes];
        memset(answer, 0xff, sizeof(answer));

        for (const int element : key)
        {
            big_and_equals(answer, sets_containing[element]);
        }

        // Display the matches
        for (int candidate = 0; candidate < max_sets; ++candidate)
        {
            if (answer[candidate / 64] & _(candidate % 64))
            {
                std::cout
                    << "set "
                    << candidate
                    << " matches"
                    << '\n';
            }
        }

        return 0;
    }
    

    Running this program yields:

    sets_containing[1][0] gets bit 0
    sets_containing[2][0] gets bit 0
    sets_containing[4][0] gets bit 0
    sets_containing[7][0] gets bit 0
    sets_containing[8][0] gets bit 0
    sets_containing[12][0] gets bit 0
    sets_containing[18][0] gets bit 0
    sets_containing[23][0] gets bit 0
    sets_containing[29][0] gets bit 0
    sets_containing[3][0] gets bit 1
    sets_containing[4][0] gets bit 1
    sets_containing[6][0] gets bit 1
    sets_containing[7][0] gets bit 1
    sets_containing[15][0] gets bit 1
    sets_containing[23][0] gets bit 1
    sets_containing[34][0] gets bit 1
    sets_containing[38][0] gets bit 1
    sets_containing[4][0] gets bit 2
    sets_containing[7][0] gets bit 2
    sets_containing[12][0] gets bit 2
    sets_containing[18][0] gets bit 2
    sets_containing[1][0] gets bit 3
    sets_containing[4][0] gets bit 3
    sets_containing[7][0] gets bit 3
    sets_containing[12][0] gets bit 3
    sets_containing[13][0] gets bit 3
    sets_containing[14][0] gets bit 3
    sets_containing[15][0] gets bit 3
    sets_containing[16][0] gets bit 3
    sets_containing[17][0] gets bit 3
    sets_containing[18][0] gets bit 3
    sets_containing[2][0] gets bit 4
    sets_containing[4][0] gets bit 4
    sets_containing[6][0] gets bit 4
    sets_containing[7][0] gets bit 4
    sets_containing[13][0] gets bit 4
    sets_containing[15][0] gets bit 4
    performed 782 comparisons
    performed 1564 comparisons
    performed 2346 comparisons
    performed 3128 comparisons
    set 0 matches
    set 2 matches
    set 3 matches
    

    3128 uint64_t comparisons beat 50000 comparisons, so you win. Even in the worst case, which would be a key that has all 50 items, you only have to do num_boxes * max_entry comparisons, which in this case is 39100. Still better than 50000.

提交回复
热议问题