What is the best way to create a sparse array in C++?

后端 未结 11 2136
心在旅途
心在旅途 2020-11-29 01:45

I am working on a project that requires the manipulation of enormous matrices, specifically pyramidal summation for a copula calculation.

In short, I need to keep

相关标签:
11条回答
  • 2020-11-29 02:00

    The best way to implement sparse matrices is to not implement them - at least not on your own. I would suggest BLAS (which I think is a part of LAPACK), which can handle really huge matrices.

    0 讨论(0)
  • 2020-11-29 02:04

    For C++, a map works well. Several million objects won't be a problem. 10 million items took about 4.4 seconds and about 57 meg on my computer.

    My test application is as follows:

    #include <stdio.h>
    #include <stdlib.h>
    #include <map>
    
    // A point in 3-D index space, usable as an ordered-map key.
    class triple {
    public:
        int x;
        int y;
        int z;

        // Lexicographic ordering on (x, y, z): the first coordinate that
        // differs decides the result; equal triples are not less than each other.
        bool operator<(const triple &other) const {
            if (x != other.x) {
                return x < other.x;
            }
            if (y != other.y) {
                return y < other.y;
            }
            return z < other.z;
        }
    };
    
    int main(int, char**)
    {
        std::map<triple,int> data;
        triple point;
        int i;
    
        for (i = 0; i < 10000000; ++i) {
            point.x = rand();
            point.y = rand();
            point.z = rand();
            //printf("%d %d %d %d\n", i, point.x, point.y, point.z);
            data[point] = i;
        }
        return 0;
    }
    

    Now, to choose the number of variables dynamically, the easiest solution is to represent the index as a string and then use that string as the key for the map. For instance, an item located at [23][55] can be represented by the string "23,55". This solution also extends to higher dimensions; for three dimensions, an arbitrary index will look like "34,45,56". A simple implementation of this technique is as follows:

    // Sparse storage keyed by the textual form of the index, e.g. "23,55".
    std::map<std::string, int> data;
    char ix[100];  // scratch buffer the key is formatted into
    
    // snprintf is bounded by the buffer size, so it cannot write past ix.
    snprintf(ix, sizeof(ix), "%d,%d", x, y); // 2 vars
    data[ix] = i;
    
    snprintf(ix, sizeof(ix), "%d,%d,%d", x, y, z); // 3 vars
    data[ix] = i;
    
    0 讨论(0)
  • 2020-11-29 02:05

    As general advice, a method using strings as indices is actually very slow. A much more efficient but otherwise equivalent solution would be to use vectors/arrays. There's absolutely no need to write the indices into a string.

    // An N-dimensional index: one coordinate per dimension.
    typedef std::vector<std::size_t> index_t;
    
    // Ordering predicate for index_t keys: lexicographic comparison, with a
    // proper prefix ordered before the longer index.
    //
    // std::binary_function (removed in C++17) is no longer used as a base; the
    // nested typedefs it used to supply are declared directly so code that
    // refers to them keeps compiling.
    struct index_cmp_t {
        typedef index_t first_argument_type;
        typedef index_t second_argument_type;
        typedef bool result_type;
    
        bool operator ()(index_t const& a, index_t const& b) const {
            // vector's operator< is already lexicographic and, unlike an
            // index loop bounded by a.size(), never reads past the end of b.
            return a < b;
        }
    };
    
    map<index_t, int, index_cmp_t> data;
    index_t i(dims);
    i[0] = 1;
    i[1] = 2;
    // … etc.
    data[i] = 42;
    

    However, using a map in practice often isn't very efficient because of the implementation in terms of a balanced binary search tree. A better performing data structure in this case would be a hash table, as provided by std::unordered_map.

    0 讨论(0)
  • 2020-11-29 02:11

    Hash tables have fast insertion and lookup. You could write a simple hash function, since you know you'd be dealing only with integer pairs as the keys.

    0 讨论(0)
  • 2020-11-29 02:13

    Here is a relatively simple implementation that should provide reasonably fast lookup (using a hash table) as well as fast iteration over non-zero elements in a row/column.

    // Copyright 2014 Leo Osvald
    //
    // Licensed under the Apache License, Version 2.0 (the "License");
    // you may not use this file except in compliance with the License.
    // You may obtain a copy of the License at
    //
    //     http://www.apache.org/licenses/LICENSE-2.0
    //
    // Unless required by applicable law or agreed to in writing, software
    // distributed under the License is distributed on an "AS IS" BASIS,
    // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    // See the License for the specific language governing permissions and
    // limitations under the License.
    
    #ifndef UTIL_IMMUTABLE_SPARSE_MATRIX_HPP_
    #define UTIL_IMMUTABLE_SPARSE_MATRIX_HPP_
    
    #include <algorithm>
    #include <limits>
    #include <map>
    #include <type_traits>
    #include <unordered_map>
    #include <utility>
    #include <vector>
    
    // A simple time-efficient implementation of an immutable sparse matrix.
    //
    // Element values are kept in a hash table keyed by (row, col).  In addition,
    // every row keeps a sorted list of the columns it has entries in (and every
    // column a sorted list of rows), which provides efficient iteration over the
    // stored elements by rows/cols,
    // e.g. to iterate over a range [row_from, row_to) x [col_from, col_to):
    //   for (int row = row_from; row < row_to; ++row) {
    //     for (auto col_range = sm.nonzero_col_range(row, col_from, col_to);
    //          col_range.first != col_range.second; ++col_range.first) {
    //       int col = *col_range.first;
    //       // use sm(row, col)
    //       ...
    //     }
    //   }
    //
    // All coordinates taken and returned by the accessors are zero-based.
    template<typename T = double, class Coord = int>
    class SparseMatrix {
      struct PointHasher;
      typedef std::map< Coord, std::vector<Coord> > NonZeroList;
      typedef std::pair<Coord, Coord> Point;
      typedef typename std::remove_cv<T>::type StoredValue;
    
     public:
      typedef T ValueType;
      typedef Coord CoordType;
      typedef typename NonZeroList::mapped_type::const_iterator CoordIter;
      typedef std::pair<CoordIter, CoordIter> CoordIterRange;
    
      // Creates an empty 0 x 0 matrix; use Init() to load data into it.
      SparseMatrix() = default;
    
      // Replaces the contents of this matrix with data read from `is`, which is
      // expected to hold a matrix in MatrixMarket-like format, i.e.:
      // <num_rows> <num_cols> <num_entries>
      // <row_1> <col_1> <val_1>
      // ...
      // Coordinates in the input are one-based.  Leading header lines (lines
      // starting with '%') are ignored.  When the same (row, col) is listed more
      // than once, the last value wins and the coordinate is recorded once.
      //
      // Reading stops at the first missing or malformed field: entries read up
      // to that point are kept and `is` is left in a failed state, so callers
      // can detect truncated input by testing the stream afterwards.
      //
      // `max_line_length` is unused (header lines of any length are skipped); it
      // is retained so that callers which spell it out explicitly still compile.
      template<class InputStream, size_t max_line_length = 1024>
      void Init(InputStream& is) {
        rows_.clear();
        cols_.clear();
        values_.clear();
        row_count_ = Coord();
        col_count_ = Coord();
    
        // Skip the header (lines beginning with '%', if any).  Peeking avoids
        // tellg()/seekg(), so the stream does not need to be seekable, and
        // ignore() has no fixed-size line buffer to overflow.
        typedef decltype(is.gcount()) StreamSize;
        while (is.peek() == '%')
          is.ignore(std::numeric_limits<StreamSize>::max(), '\n');
    
        size_t n = 0;
        if (!(is >> row_count_ >> col_count_ >> n)) {
          // No usable size line: stay an empty 0 x 0 matrix.
          row_count_ = Coord();
          col_count_ = Coord();
          return;
        }
    
        values_.reserve(n);
        for (; n != 0; --n) {
          Coord row = Coord();
          Coord col = Coord();
          StoredValue val = StoredValue();
          if (!(is >> row >> col >> val))
            break;  // truncated or malformed entry: keep what was read so far
    
          // Convert from the one-based file convention to zero-based.
          --row;
          --col;
    
          auto inserted = values_.emplace(Point(row, col), val);
          if (inserted.second) {
            rows_[col].push_back(row);
            cols_[row].push_back(col);
          } else {
            // Repeated coordinate: overwrite the value, do not list it twice.
            inserted.first->second = val;
          }
        }
        SortAndShrink(rows_);
        SortAndShrink(cols_);
      }
    
      // Returns the element stored at (row, col), or a reference to a shared
      // value-initialized T when there is no entry for that position.
      const T& operator()(const Coord& row, const Coord& col) const {
        static const T kZero = T();
        auto it = values_.find(Point(row, col));
        if (it != values_.end())
          return it->second;
        return kZero;
      }
    
      // Returns the sorted range of columns c in [col_from, col_to) for which
      // (row, c) has a stored entry.  The range is empty if there are none.
      CoordIterRange
      nonzero_col_range(Coord row, Coord col_from, Coord col_to) const {
        return GetRange(cols_, row, col_from, col_to);
      }
    
      // Returns the sorted range of rows r in [row_from, row_to) for which
      // (r, col) has a stored entry.  The range is empty if there are none.
      CoordIterRange
      nonzero_row_range(Coord col, Coord row_from, Coord row_to) const {
        return GetRange(rows_, col, row_from, row_to);
      }
    
      Coord row_count() const { return row_count_; }
      Coord col_count() const { return col_count_; }
      // Number of entries actually stored.
      size_t nonzero_count() const { return values_.size(); }
      // Total number of positions, i.e. rows * cols.
      size_t element_count() const { return size_t(row_count_) * col_count_; }
    
     private:
      typedef std::unordered_map<Point, StoredValue, PointHasher> ValueMap;
    
      // Hashes a (row, col) pair by placing the row in the upper half of the
      // hash and xor-ing in the column.  The arithmetic is done on size_t:
      // left-shifting a signed Coord can overflow, which is undefined behaviour.
      struct PointHasher {
        size_t operator()(const Point& p) const {
          const size_t hi = static_cast<size_t>(p.first);
          const size_t lo = static_cast<size_t>(p.second);
          return (hi << (std::numeric_limits<size_t>::digits / 2)) ^ lo;
        }
      };
    
      // Sorts every coordinate list (GetRange binary-searches them) and releases
      // the spare capacity left over from push_back growth.
      static void SortAndShrink(NonZeroList& list) {
        for (auto& entry : list) {
          auto& indices = entry.second;
          std::sort(indices.begin(), indices.end());
          indices.shrink_to_fit();
        }
      }
    
      // Returns the sub-range of list[i] whose coordinates lie in [from, to).
      // If `list` has no entry for `i`, an empty range over a shared empty
      // vector is returned, so the lookup is valid even when `list` is empty.
      static CoordIterRange GetRange(const NonZeroList& list, Coord i,
                                     Coord from, Coord to) {
        auto found = list.find(i);
        if (found == list.end()) {
          static const std::vector<Coord> kNoCoords = std::vector<Coord>();
          return CoordIterRange(kNoCoords.end(), kNoCoords.end());
        }
    
        const std::vector<Coord>& coords = found->second;
        CoordIter first = std::lower_bound(coords.begin(), coords.end(), from);
        CoordIter last = std::lower_bound(first, coords.end(), to);
        return CoordIterRange(first, last);
      }
    
      ValueMap values_;
      NonZeroList rows_;  // column -> sorted rows with an entry in that column
      NonZeroList cols_;  // row -> sorted columns with an entry in that row
      Coord row_count_ = Coord();
      Coord col_count_ = Coord();
    };
    
    #endif  /* UTIL_IMMUTABLE_SPARSE_MATRIX_HPP_ */
    

    For simplicity, it's immutable, but you can make it mutable; be sure to change std::vector to std::set if you want reasonably efficient "insertions" (changing a zero to a non-zero).

    0 讨论(0)
提交回复
热议问题