sort array of integers lexicographically C++

前端 未结 12 1512
野的像风
野的像风 2021-02-02 13:53

I want to sort a large array of integers (say 1 million elements) lexicographically.

Example:

input [] = { 100, 21 , 22 , 99 , 1  , 927 }
sorted[] = { 1  , 100, 21 , 22 , 927, 99 }


        
12条回答
  •  悲哀的现实
    2021-02-02 14:33

    Here is the dumb solution that doesn't use any floating-point tricks. It's pretty much the same as the string comparison, but it doesn't use a string per se. It also doesn't handle negative numbers; to do that, add a section at the top...

    // Orders two integers by their decimal digit sequences, most significant
    // digit first, the way their printed text would be ordered. A value whose
    // digits form a proper prefix of the other's sorts first; zero contributes
    // no digits at all and therefore precedes every positive value.
    // Written for non-negative inputs only: no sign handling is performed.
    bool comp(int l, int r)
    {
      constexpr int kSlots = 10; // enough decimal digits for a 32-bit int

      // Digits are peeled off least-significant first and stored right-aligned,
      // so each *Pos ends up indexing the most significant digit.
      int lhsDigits[kSlots] = {};
      int lhsPos = kSlots;
      for (int rest = l; rest != 0; rest /= 10)
        lhsDigits[--lhsPos] = rest % 10;

      int rhsDigits[kSlots] = {};
      int rhsPos = kSlots;
      for (int rest = r; rest != 0; rest /= 10)
        rhsDigits[--rhsPos] = rest % 10;

      // Walk both digit strings from the front until they differ or one ends.
      for (; lhsPos < kSlots && rhsPos < kSlots; ++lhsPos, ++rhsPos)
      {
        if (lhsDigits[lhsPos] != rhsDigits[rhsPos])
          return lhsDigits[lhsPos] < rhsDigits[rhsPos];
      }

      // Whichever side ran out of digits first is the prefix and sorts first;
      // identical inputs finish together and compare as "not less".
      return lhsPos > rhsPos;
    }
    

    It's fast, and I'm sure it's possible to make it faster still, but it works and it's dumb enough to understand...

    EDIT: I hate to dump lots of code, but here is a comparison of all the solutions so far..

    #include <algorithm>
    #include <cassert>
    #include <chrono>
    #include <cmath>
    #include <iostream>
    #include <iterator>
    #include <limits>
    #include <random>
    #include <utility>
    #include <vector>
    
    /**
     * Builds the sort key for the "pad every number to a common width" approach.
     *
     * @param p          Value to key; intended to be non-negative.
     * @param maxDigits  Zero-based index of the most significant digit of the
     *                   widest value in the data set, i.e. floor(log10(max)).
     * @return {key, p}: key is p with trailing zeros appended until it is as
     *         wide as the widest value. Sorting the pairs by key and then by p
     *         yields lexicographic order (1, 10, 100 share a key and are then
     *         ordered by value).
     *
     * The key is 64-bit because padding a small value to ten digits does not fit
     * in a 32-bit int (5 -> 5'000'000'000). Digits are counted with integer
     * arithmetic, so p == 0 is handled without evaluating log10(0).
     */
    std::pair<long long, int> lexicographic_pair_helper(int p, int maxDigits)
    {
      // Zero-based index of p's most significant digit (0 for 0..9).
      int digits = 0;
      for (int rest = p; rest >= 10; rest /= 10)
        ++digits;

      long long key = p;
      // Pad on the right up to the common width.
      for (int i = digits; i < maxDigits; ++i)
        key *= 10;
      // p is wider than announced: drop trailing digits, mirroring the
      // truncation the original fractional scale factor produced.
      for (int i = maxDigits; i < digits; ++i)
        key /= 10;

      return {key, p};
    }
    
    /**
     * Strict weak ordering that compares two ints the way their decimal text
     * would compare, without building any strings.
     *
     * - A negative value sorts before any non-negative value ('-' precedes every
     *   digit in ASCII).
     * - Two values of the same sign are compared digit by digit from the most
     *   significant end of their magnitudes; if one digit string is a proper
     *   prefix of the other, the shorter one sorts first.
     * - Zero contributes no digits, so it precedes every positive value.
     *
     * @return true when l must be placed before r.
     */
    bool l_comp(int l, int r)
    {
      // Mixed signs are decided by the sign alone.
      if ((l < 0) != (r < 0))
        return l < 0;

      // Work on magnitudes. Negating in unsigned arithmetic is well defined even
      // for the most negative int, whose magnitude does not fit in an int.
      unsigned int lm = static_cast<unsigned int>(l);
      unsigned int rm = static_cast<unsigned int>(r);
      if (l < 0)
      {
        lm = 0u - lm;
        rm = 0u - rm;
      }

      // Largest number of decimal digits an unsigned int can have.
      constexpr int max_digits = std::numeric_limits<unsigned int>::digits10 + 1;
      int lv[max_digits] = {};
      int rv[max_digits] = {};

      // Digits are extracted least-significant first and stored right-aligned,
      // so lc / rc finish on the index of the most significant digit.
      int lc = max_digits;
      int rc = max_digits;
      while (lm || rm)
      {
        if (lm)
        {
          lv[--lc] = static_cast<int>(lm % 10u);
          lm /= 10u;
        }
        if (rm)
        {
          rv[--rc] = static_cast<int>(rm % 10u);
          rm /= 10u;
        }
      }

      // Compare from the most significant digit until a difference shows up or
      // one side runs out of digits.
      while (lc < max_digits && rc < max_digits)
      {
        if (lv[lc] == rv[rc])
        {
          lc++;
          rc++;
        }
        else
          return lv[lc] < rv[rc];
      }

      // The side that ran out first is the prefix and sorts first; equal inputs
      // run out together and compare as "not less".
      return lc > rc;
    }
    
    /**
     * Returns the smallest power of ten that is >= n (1 for n <= 1).
     *
     * When that power is not representable in an int (n larger than the largest
     * int power of ten), the largest representable power of ten is returned
     * instead of overflowing.
     */
    int up_10pow(int n) {
      // Multiplying anything above this by ten would overflow int.
      constexpr int overflow_guard = std::numeric_limits<int>::max() / 10;
      int ans = 1;
      while (ans < n && ans <= overflow_guard) ans *= 10;
      return ans;
    }

    /**
     * Lexicographic "less than" for two non-negative ints, comparing decimal
     * digits from the most significant end by integer division.
     *
     * If one value's digits are a proper prefix of the other's, the shorter value
     * sorts first. Negative inputs get no sign-aware treatment here.
     *
     * @return true when v1 must be placed before v2.
     */
    bool l_comp2(int v1, int v2) {
      // n1 / n2 must be the place value of the leading digit. up_10pow rounds
      // *up*, so step back one decade unless the value is itself a power of ten
      // (or up_10pow saturated). The "> 1" test keeps zero a one-digit number.
      int n1 = up_10pow(v1), n2 = up_10pow(v2);
      if (n1 > v1 && n1 > 1) n1 /= 10;
      if (n2 > v2 && n2 > 1) n2 /= 10;

      // Compare one digit per iteration, moving towards the units place.
      while (n1 != 0 && n2 != 0) {
        const int d1 = v1 / n1 % 10;
        const int d2 = v2 / n2 % 10;
        if (d1 != d2) return d1 < d2;
        n1 /= 10;
        n2 /= 10;
      }

      // All shared digits matched: v1 comes first only if it is strictly shorter.
      return n1 == 0 && n2 != 0;
    }
    
    int main()
    {
      std::vector numbers;
      {
        constexpr int number_of_elements = 1E6;
        std::random_device rd;
        std::mt19937 gen( rd() );
        std::uniform_int_distribution<> dist;
        for(int i = 0; i < number_of_elements; ++i) numbers.push_back( dist(gen) );
      }
    
      std::vector lo(numbers);
      std::vector dyp(numbers);
      std::vector nim(numbers);
      std::vector nb(numbers);
    
      std::cout << "starting..." << std::endl;
    
      {
    
        auto start = std::chrono::high_resolution_clock::now();
        /**
        * Sorts the array lexicographically.
        *
        * The trick is that we have to compare digits left-to-right
        * (considering typical Latin decimal notation) and that each of
        * two numbers to compare may have a different number of digits.
        *
        * This probably isn't very efficient, so I wouldn't do it on
        * "millions" of numbers. But, it works...
        */
        std::sort(
        std::begin(lo),
                  std::end(lo),
                  [](int lhs, int rhs) -> bool {
                    // Returns true if lhs < rhs
                    // Returns false otherwise
                    const auto BASE      = 10;
                    const bool LHS_FIRST = true;
                    const bool RHS_FIRST = false;
                    const bool EQUAL     = false;
    
    
                    // There's no point in doing anything at all
                    // if both inputs are the same; strict-weak
                    // ordering requires that we return `false`
                    // in this case.
                    if (lhs == rhs) {
                      return EQUAL;
                    }
    
                    // Compensate for sign
                    if (lhs < 0 && rhs < 0) {
                      // When both are negative, sign on its own yields
                      // no clear ordering between the two arguments.
                      //
                      // Remove the sign and continue as for positive
                      // numbers.
                      lhs *= -1;
                      rhs *= -1;
                    }
                    else if (lhs < 0) {
                      // When the LHS is negative but the RHS is not,
                  // consider the LHS "first" always as we wish to
                  // prioritise the leading '-'.
                  return LHS_FIRST;
                    }
                    else if (rhs < 0) {
                      // When the RHS is negative but the LHS is not,
                  // consider the RHS "first" always as we wish to
                  // prioritise the leading '-'.
                  return RHS_FIRST;
                    }
    
                    // Counting the number of digits in both the LHS and RHS
                    // arguments is *almost* trivial.
                    const auto lhs_digits = (
                    lhs == 0
                    ? 1
                    : std::ceil(std::log(lhs+1)/std::log(BASE))
                    );
    
                    const auto rhs_digits = (
                    rhs == 0
                    ? 1
                    : std::ceil(std::log(rhs+1)/std::log(BASE))
                    );
    
                    // Now we loop through the positions, left-to-right,
                  // calculating the digit at these positions for each
                  // input, and comparing them numerically. The
                  // lexicographic nature of the sorting comes from the
                  // fact that we are doing this per-digit comparison
                  // rather than considering the input value as a whole.
                  const auto max_pos = std::max(lhs_digits, rhs_digits);
                  for (auto pos = 0; pos < max_pos; pos++) {
                    if (lhs_digits - pos == 0) {
                      // Ran out of digits on the LHS;
                      // prioritise the shorter input
                      return LHS_FIRST;
                    }
                    else if (rhs_digits - pos == 0) {
                      // Ran out of digits on the RHS;
                      // prioritise the shorter input
                      return RHS_FIRST;
                    }
                    else {
                      const auto lhs_x = (lhs / static_cast(std::pow(BASE, lhs_digits - 1 - pos))) % BASE;
                      const auto rhs_x = (rhs / static_cast(std::pow(BASE, rhs_digits - 1 - pos))) % BASE;
    
                      if (lhs_x < rhs_x)
                        return LHS_FIRST;
                      else if (rhs_x < lhs_x)
                        return RHS_FIRST;
                    }
                  }
    
                  // If we reached the end and everything still
                  // matches up, then something probably went wrong
                  // as I'd have expected to catch this in the tests
                  // for equality.
                  assert("Unknown case encountered");
                  }
                  );
    
        auto end = std::chrono::high_resolution_clock::now();
        auto elapsed = end - start;
        std::cout << "Lightness: " << elapsed.count() << '\n';
      }
    
      {
        auto start = std::chrono::high_resolution_clock::now();
    
        auto max = *std::max_element(begin(dyp), end(dyp));
        int maxDigits = std::log10(max);
    
        std::vector> temp;
        temp.reserve(dyp.size());
        for(auto const& e : dyp) temp.push_back( lexicographic_pair_helper(e, maxDigits) );
    
        std::sort(begin(temp), end(temp), [](std::pair const& l, std::pair const& r)
        { if(l.first < r.first) return true; if(l.first > r.first) return false; return l.second < r.second; });
    
        auto end = std::chrono::high_resolution_clock::now();
        auto elapsed = end - start;
        std::cout << "Dyp: " << elapsed.count() << '\n';
      }
    
      {
        auto start = std::chrono::high_resolution_clock::now();
        std::sort (nim.begin(), nim.end(), l_comp);
        auto end = std::chrono::high_resolution_clock::now();
        auto elapsed = end - start;
        std::cout << "Nim: " << elapsed.count() << '\n';
      }
    
    //   {
    //     auto start = std::chrono::high_resolution_clock::now();
    //     std::sort (nb.begin(), nb.end(), l_comp2);
    //     auto end = std::chrono::high_resolution_clock::now();
    //     auto elapsed = end - start;
    //     std::cout << "notbad: " << elapsed.count() << '\n';
    //   }
    
      std::cout << (nim == lo) << std::endl;
      std::cout << (nim == dyp) << std::endl;
      std::cout << (lo == dyp) << std::endl;
    //   std::cout << (lo == nb) << std::endl;
    }
    

提交回复
热议问题