sort array of integers lexicographically C++

前端未结

关注

 12  1512

野的像风 2021-02-02 13:53

I want to sort a large array of integers (say 1 millon elements) lexicographically.

Example:

input [] = { 100, 21 , 22 , 99 , 1  , 927 }
sorted[] = { 1


      
      
        
          12条回答        

        
                    
            
            
                         
                
              
              
                
                   悲哀的现实
                                             
                
                
                (楼主)
            
              
              
                2021-02-02 14:33
              

            
            
                        
Here is the dumb solution that doesn't use any floating point tricks. It's pretty much the same as the string comparison, but doesn't use a string per say, doesn't also handle negative numbers, to do that add a section at the top...

bool comp(int l, int r)
{
  int lv[10] = {}; // probably possible to get this from numeric_limits
  int rv[10] = {};

  int lc = 10; // ditto
  int rc = 10;
  while (l || r)
  {
    if (l)
    {
      auto t = l / 10;
      lv[--lc] = l - (t * 10);
      l = t;
    }
    if (r)
    {
      auto t = r / 10;
      rv[--rc] = r - (t * 10);
      r = t;
    }
  }
  while (lc < 10 && rc < 10)
  {
    if (lv[lc] == rv[rc])
    {
      lc++;
      rc++;
    }
    else
      return lv[lc] < rv[rc];
  }
  return lc > rc;
}


It's fast, and I'm sure it's possible to make it faster still, but it works and it's dumb enough to understand...

EDIT: I ate to dump lots of code, but here is a comparison of all the solutions so far..

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

std::pair lexicographic_pair_helper(int p, int maxDigits)
{
  int digits = std::log10(p);
  int l = p*std::pow(10, maxDigits-digits);
  return {l, p};
}

bool l_comp(int l, int r)
{
  int lv[10] = {}; // probably possible to get this from numeric_limits
  int rv[10] = {};

  int lc = 10; // ditto
  int rc = 10;
  while (l || r)
  {
    if (l)
    {
      auto t = l / 10;
      lv[--lc] = l - (t * 10);
      l = t;
    }
    if (r)
    {
      auto t = r / 10;
      rv[--rc] = r - (t * 10);
      r = t;
    }
  }
  while (lc < 10 && rc < 10)
  {
    if (lv[lc] == rv[rc])
    {
      lc++;
      rc++;
    }
    else
      return lv[lc] < rv[rc];
  }
  return lc > rc;
}

int up_10pow(int n) {
  int ans = 1;
  while (ans < n) ans *= 10;
  return ans;
}
bool l_comp2(int v1, int v2) {
  int n1 = up_10pow(v1), n2 = up_10pow(v2);
  while ( v1 != 0 && v2 != 0) {
    if (v1 / n1  < v2 / n2) return true;
    else if (v1 / n1 > v2 / n2) return false;
    v1 /= 10;
    v2 /= 10;
    n1 /= 10;
    n2 /= 10;
  }
  if (v1 == 0 && v2 != 0) return true;
  return false;
}

int main()
{
  std::vector numbers;
  {
    constexpr int number_of_elements = 1E6;
    std::random_device rd;
    std::mt19937 gen( rd() );
    std::uniform_int_distribution<> dist;
    for(int i = 0; i < number_of_elements; ++i) numbers.push_back( dist(gen) );
  }

  std::vector lo(numbers);
  std::vector dyp(numbers);
  std::vector nim(numbers);
  std::vector nb(numbers);

  std::cout << "starting..." << std::endl;

  {

    auto start = std::chrono::high_resolution_clock::now();
    /**
    * Sorts the array lexicographically.
    *
    * The trick is that we have to compare digits left-to-right
    * (considering typical Latin decimal notation) and that each of
    * two numbers to compare may have a different number of digits.
    *
    * This probably isn't very efficient, so I wouldn't do it on
    * "millions" of numbers. But, it works...
    */
    std::sort(
    std::begin(lo),
              std::end(lo),
              [](int lhs, int rhs) -> bool {
                // Returns true if lhs < rhs
                // Returns false otherwise
                const auto BASE      = 10;
                const bool LHS_FIRST = true;
                const bool RHS_FIRST = false;
                const bool EQUAL     = false;


                // There's no point in doing anything at all
                // if both inputs are the same; strict-weak
                // ordering requires that we return `false`
                // in this case.
                if (lhs == rhs) {
                  return EQUAL;
                }

                // Compensate for sign
                if (lhs < 0 && rhs < 0) {
                  // When both are negative, sign on its own yields
                  // no clear ordering between the two arguments.
                  //
                  // Remove the sign and continue as for positive
                  // numbers.
                  lhs *= -1;
                  rhs *= -1;
                }
                else if (lhs < 0) {
                  // When the LHS is negative but the RHS is not,
              // consider the LHS "first" always as we wish to
              // prioritise the leading '-'.
              return LHS_FIRST;
                }
                else if (rhs < 0) {
                  // When the RHS is negative but the LHS is not,
              // consider the RHS "first" always as we wish to
              // prioritise the leading '-'.
              return RHS_FIRST;
                }

                // Counting the number of digits in both the LHS and RHS
                // arguments is *almost* trivial.
                const auto lhs_digits = (
                lhs == 0
                ? 1
                : std::ceil(std::log(lhs+1)/std::log(BASE))
                );

                const auto rhs_digits = (
                rhs == 0
                ? 1
                : std::ceil(std::log(rhs+1)/std::log(BASE))
                );

                // Now we loop through the positions, left-to-right,
              // calculating the digit at these positions for each
              // input, and comparing them numerically. The
              // lexicographic nature of the sorting comes from the
              // fact that we are doing this per-digit comparison
              // rather than considering the input value as a whole.
              const auto max_pos = std::max(lhs_digits, rhs_digits);
              for (auto pos = 0; pos < max_pos; pos++) {
                if (lhs_digits - pos == 0) {
                  // Ran out of digits on the LHS;
                  // prioritise the shorter input
                  return LHS_FIRST;
                }
                else if (rhs_digits - pos == 0) {
                  // Ran out of digits on the RHS;
                  // prioritise the shorter input
                  return RHS_FIRST;
                }
                else {
                  const auto lhs_x = (lhs / static_cast(std::pow(BASE, lhs_digits - 1 - pos))) % BASE;
                  const auto rhs_x = (rhs / static_cast(std::pow(BASE, rhs_digits - 1 - pos))) % BASE;

                  if (lhs_x < rhs_x)
                    return LHS_FIRST;
                  else if (rhs_x < lhs_x)
                    return RHS_FIRST;
                }
              }

              // If we reached the end and everything still
              // matches up, then something probably went wrong
              // as I'd have expected to catch this in the tests
              // for equality.
              assert("Unknown case encountered");
              }
              );

    auto end = std::chrono::high_resolution_clock::now();
    auto elapsed = end - start;
    std::cout << "Lightness: " << elapsed.count() << '\n';
  }

  {
    auto start = std::chrono::high_resolution_clock::now();

    auto max = *std::max_element(begin(dyp), end(dyp));
    int maxDigits = std::log10(max);

    std::vector> temp;
    temp.reserve(dyp.size());
    for(auto const& e : dyp) temp.push_back( lexicographic_pair_helper(e, maxDigits) );

    std::sort(begin(temp), end(temp), [](std::pair const& l, std::pair const& r)
    { if(l.first < r.first) return true; if(l.first > r.first) return false; return l.second < r.second; });

    auto end = std::chrono::high_resolution_clock::now();
    auto elapsed = end - start;
    std::cout << "Dyp: " << elapsed.count() << '\n';
  }

  {
    auto start = std::chrono::high_resolution_clock::now();
    std::sort (nim.begin(), nim.end(), l_comp);
    auto end = std::chrono::high_resolution_clock::now();
    auto elapsed = end - start;
    std::cout << "Nim: " << elapsed.count() << '\n';
  }

//   {
//     auto start = std::chrono::high_resolution_clock::now();
//     std::sort (nb.begin(), nb.end(), l_comp2);
//     auto end = std::chrono::high_resolution_clock::now();
//     auto elapsed = end - start;
//     std::cout << "notbad: " << elapsed.count() << '\n';
//   }

  std::cout << (nim == lo) << std::endl;
  std::cout << (nim == dyp) << std::endl;
  std::cout << (lo == dyp) << std::endl;
//   std::cout << (lo == nb) << std::endl;
}

    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它12个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复