Natural Sort Order in C#

后端 未结 17 1997
野性不改
野性不改 2020-11-21 04:54

Does anyone have a good resource, or can anyone provide a sample, of a natural order sort in C# for a FileInfo array? I am implementing the IComparer interface in…

相关标签:
17条回答
  • 2020-11-21 05:38

    Here's a relatively simple example that doesn't use P/Invoke and avoids any allocation during execution.

    /// <summary>
    /// Ordinal string comparer that orders embedded runs of digits by numeric magnitude
    /// ("a2" before "a10"). It performs no allocation and no P/Invoke.
    /// </summary>
    /// <remarks>
    /// Leading zeroes are not ignored: a longer digit run always sorts after a shorter one,
    /// so "01" comes after "2". Every result is normalised to -1, 0 or 1.
    /// </remarks>
    internal sealed class NumericStringComparer : IComparer<string>
    {
        /// <summary>Shared instance; the comparer holds no state.</summary>
        public static NumericStringComparer Instance { get; } = new NumericStringComparer();

        /// <summary>Compares two strings in natural order.</summary>
        /// <param name="x">Left string; may be null.</param>
        /// <param name="y">Right string; may be null.</param>
        /// <returns>-1 when <paramref name="x"/> sorts first, 1 when it sorts last, 0 when equal.</returns>
        public int Compare(string x, string y)
        {
            // sort nulls to the start
            if (x == null)
            {
                return y == null ? 0 : -1;
            }

            if (y == null)
            {
                return 1;
            }

            var ix = 0;
            var iy = 0;

            while (true)
            {
                // sort shorter strings to the start
                if (ix >= x.Length)
                {
                    return iy >= y.Length ? 0 : -1;
                }

                if (iy >= y.Length)
                {
                    return 1;
                }

                var cx = x[ix];
                var cy = y[iy];

                if (char.IsDigit(cx) && char.IsDigit(cy))
                {
                    var numberResult = CompareInteger(x, y, ref ix, ref iy);
                    if (numberResult != 0)
                    {
                        return numberResult;
                    }

                    // CompareInteger has already moved both indices past the equal digit runs.
                    // Advancing again here would skip the character that follows the number.
                    continue;
                }

                if (cx != cy)
                {
                    return cx < cy ? -1 : 1;
                }

                ix++;
                iy++;
            }
        }

        /// <summary>
        /// Compares the digit runs starting at <paramref name="ix"/> and <paramref name="iy"/>.
        /// When the runs are equal, both indices are left just past them.
        /// </summary>
        private static int CompareInteger(string x, string y, ref int ix, ref int iy)
        {
            var lx = GetNumLength(x, ix);
            var ly = GetNumLength(y, iy);

            // shorter number first (note, doesn't handle leading zeroes)
            if (lx != ly)
            {
                return lx < ly ? -1 : 1;
            }

            for (var i = 0; i < lx; i++)
            {
                var dx = x[ix++];
                var dy = y[iy++];
                if (dx != dy)
                {
                    return dx < dy ? -1 : 1;
                }
            }

            return 0;
        }

        /// <summary>Counts the consecutive digit characters in <paramref name="s"/> starting at index <paramref name="i"/>.</summary>
        private static int GetNumLength(string s, int i)
        {
            var length = 0;
            while (i < s.Length && char.IsDigit(s[i]))
            {
                i++;
                length++;
            }

            return length;
        }
    }
    

    It doesn't ignore leading zeroes, so 01 comes after 2.

    Corresponding unit test:

    /// <summary>Table-driven ordering checks for <c>NumericStringComparer</c>.</summary>
    public class NumericStringComparerTests
    {
        [Fact]
        public void OrdersCorrectly()
        {
            // Pairs that must compare as equal in both directions.
            var equalPairs = new[]
            {
                new[] { "", "" },
                new string[] { null, null },
                new[] { "Hello", "Hello" },
                new[] { "Hello123", "Hello123" },
                new[] { "123", "123" },
                new[] { "123Hello", "123Hello" },
            };

            // Pairs where the first element must sort strictly before the second.
            var orderedPairs = new[]
            {
                new[] { "", "Hello" },
                new[] { null, "Hello" },
                new[] { "Hello", "Hello1" },
                new[] { "Hello123", "Hello124" },
                new[] { "Hello123", "Hello133" },
                new[] { "Hello123", "Hello223" },
                new[] { "123", "124" },
                new[] { "123", "133" },
                new[] { "123", "223" },
                new[] { "123", "1234" },
                new[] { "123", "2345" },
                new[] { "0", "1" },
                new[] { "123Hello", "124Hello" },
                new[] { "123Hello", "133Hello" },
                new[] { "123Hello", "223Hello" },
                new[] { "123Hello", "1234Hello" },
            };

            foreach (var pair in equalPairs)
            {
                AssertEqual(pair[0], pair[1]);
            }

            foreach (var pair in orderedPairs)
            {
                AssertOrdered(pair[0], pair[1]);
            }
        }

        // Runs the comparer under test on one pair.
        private static int Run(string left, string right)
        {
            return NumericStringComparer.Instance.Compare(left, right);
        }

        // Both directions must report equality.
        private static void AssertEqual(string x, string y)
        {
            Assert.Equal(0, Run(x, y));
            Assert.Equal(0, Run(y, x));
        }

        // x must sort before y, and y after x.
        private static void AssertOrdered(string x, string y)
        {
            Assert.Equal(-1, Run(x, y));
            Assert.Equal( 1, Run(y, x));
        }
    }
    
    0 讨论(0)
  • 2020-11-21 05:39

    Here is a naive one-line regex-less LINQ way (borrowed from Python):

    var alphaStrings = new List<string>() { "10","2","3","4","50","11","100","a12","b12" };
    // Sort key is (numeric value, original text). Entries that are not a plain run of ASCII digits
    // fitting in an int get int.MaxValue, so they sort after the numbers and then by their text.
    // TryParse with NumberStyles.None (no sign, no whitespace) is used instead of
    // All(char.IsDigit) + int.Parse, which throws for "" (All is vacuously true), for digit runs
    // beyond the int range, and for non-ASCII digits.
    var orderedString = alphaStrings.OrderBy(g =>
    {
        int number;
        bool isNumber = int.TryParse(
            g,
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture,
            out number);
        return new Tuple<int, string>(isNumber ? number : int.MaxValue, g);
    });
    // Order Now: ["2","3","4","10","11","50","100","a12","b12"]
    
    0 讨论(0)
  • 2020-11-21 05:44

    None of the existing implementations looked great so I wrote my own. The results are almost identical to the sorting used by modern versions of Windows Explorer (Windows 7/8). The only differences I've seen are 1) although Windows used to (e.g. XP) handle numbers of any length, it's now limited to 19 digits - mine is unlimited, 2) Windows gives inconsistent results with certain sets of Unicode digits - mine works fine (although it doesn't numerically compare digits from surrogate pairs; nor does Windows), and 3) mine can't distinguish different types of non-primary sort weights if they occur in different sections (e.g. "e-1é" vs "é1e-" - the sections before and after the number have diacritic and punctuation weight differences).

    /// <summary>
    /// Natural-order comparison of two strings using the current culture and ignoring case.
    /// </summary>
    /// <param name="strA">First string.</param>
    /// <param name="strB">Second string.</param>
    /// <returns>The result of the four-argument overload for these defaults.</returns>
    public static int CompareNatural(string strA, string strB) =>
        CompareNatural(strA, strB, CultureInfo.CurrentCulture, CompareOptions.IgnoreCase);
    
    /// <summary>
    /// Compares two strings in natural order: runs of digits are compared by numeric value
    /// (of any length), and runs of other characters are compared with the given culture and options.
    /// </summary>
    /// <param name="strA">First string; null sorts before any non-null string.</param>
    /// <param name="strB">Second string; null sorts before any non-null string.</param>
    /// <param name="culture">Culture whose <see cref="CompareInfo"/> compares the non-digit sections.</param>
    /// <param name="options">Options passed to every culture-aware comparison.</param>
    /// <returns>A negative value if <paramref name="strA"/> sorts first, a positive value if it sorts last, 0 if equal.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
    public static int CompareNatural(string strA, string strB, CultureInfo culture, CompareOptions options) {
        if (culture == null) {
            throw new ArgumentNullException(nameof(culture));
        }
        // Null handling follows the usual comparer convention (null first) instead of throwing
        // a NullReferenceException on the first dereference.
        if (strA == null) {
            return strB == null ? 0 : -1;
        }
        if (strB == null) {
            return 1;
        }

        CompareInfo cmp = culture.CompareInfo;
        int iA = 0;
        int iB = 0;
        // A "soft" result is a tie-breaker that is only returned if nothing more significant differs
        // later on. Weight 1 comes from text sections, weight 2 from digit families; a soft result is
        // only recorded while the stored weight is lower than the new one.
        int softResult = 0;
        int softResultWeight = 0;
        while (iA < strA.Length && iB < strB.Length) {
            bool isDigitA = Char.IsDigit(strA[iA]);
            bool isDigitB = Char.IsDigit(strB[iB]);
            if (isDigitA != isDigitB) {
                // A number meets text: let the culture decide on the remainder of both strings.
                return cmp.Compare(strA, iA, strB, iB, options);
            }
            else if (!isDigitA && !isDigitB) {
                // Both sides are in a text section: find where each section ends.
                int jA = iA + 1;
                int jB = iB + 1;
                while (jA < strA.Length && !Char.IsDigit(strA[jA])) jA++;
                while (jB < strB.Length && !Char.IsDigit(strB[jB])) jB++;
                int cmpResult = cmp.Compare(strA, iA, jA - iA, strB, iB, jB - iB, options);
                if (cmpResult != 0) {
                    // Certain strings may be considered different due to "soft" differences that are
                    // ignored if more significant differences follow, e.g. a hyphen only affects the
                    // comparison if no other differences follow
                    string sectionA = strA.Substring(iA, jA - iA);
                    string sectionB = strB.Substring(iB, jB - iB);
                    // Probe: append differing digits in both orders. If the outcome is the same either
                    // way, the sections themselves decide the order (a "hard" difference).
                    if (cmp.Compare(sectionA + "1", sectionB + "2", options) ==
                        cmp.Compare(sectionA + "2", sectionB + "1", options))
                    {
                        return cmp.Compare(strA, iA, strB, iB, options);
                    }
                    else if (softResultWeight < 1) {
                        softResult = cmpResult;
                        softResultWeight = 1;
                    }
                }
                iA = jA;
                iB = jB;
            }
            else {
                // Both sides are in a number. The zero character of each digit's family is the
                // digit's code point minus its numeric value.
                char zeroA = (char)(strA[iA] - (int)Char.GetNumericValue(strA[iA]));
                char zeroB = (char)(strB[iB] - (int)Char.GetNumericValue(strB[iB]));
                int jA = iA;
                int jB = iB;
                // Skip leading zeros.
                while (jA < strA.Length && strA[jA] == zeroA) jA++;
                while (jB < strB.Length && strB[jB] == zeroB) jB++;
                int resultIfSameLength = 0;
                do {
                    isDigitA = jA < strA.Length && Char.IsDigit(strA[jA]);
                    isDigitB = jB < strB.Length && Char.IsDigit(strB[jB]);
                    int numA = isDigitA ? (int)Char.GetNumericValue(strA[jA]) : 0;
                    int numB = isDigitB ? (int)Char.GetNumericValue(strB[jB]) : 0;
                    // A digit from a different family ends the current number.
                    if (isDigitA && (char)(strA[jA] - numA) != zeroA) isDigitA = false;
                    if (isDigitB && (char)(strB[jB] - numB) != zeroB) isDigitB = false;
                    if (isDigitA && isDigitB) {
                        // Remember only the first differing digit; it decides if the lengths match.
                        if (numA != numB && resultIfSameLength == 0) {
                            resultIfSameLength = numA < numB ? -1 : 1;
                        }
                        jA++;
                        jB++;
                    }
                }
                while (isDigitA && isDigitB);
                if (isDigitA != isDigitB) {
                    // One number has more digits than the other (ignoring leading zeros) - the longer
                    // number must be larger
                    return isDigitA ? 1 : -1;
                }
                else if (resultIfSameLength != 0) {
                    // Both numbers are the same length (ignoring leading zeros) and at least one of
                    // the digits differed - the first difference determines the result
                    return resultIfSameLength;
                }
                int lA = jA - iA;
                int lB = jB - iB;
                if (lA != lB) {
                    // Both numbers are equivalent but one has more leading zeros
                    return lA > lB ? -1 : 1;
                }
                else if (zeroA != zeroB && softResultWeight < 2) {
                    // Same value and length, but written in different digit families.
                    softResult = cmp.Compare(strA, iA, 1, strB, iB, 1, options);
                    softResultWeight = 2;
                }
                iA = jA;
                iB = jB;
            }
        }
        if (iA < strA.Length || iB < strB.Length) {
            // One string has characters left over: it sorts after the other.
            return iA < strA.Length ? 1 : -1;
        }
        else if (softResult != 0) {
            return softResult;
        }
        return 0;
    }
    

    The signature matches the Comparison<string> delegate:

    // Sort the entries of C:\ in place, using CompareNatural as a Comparison<string>.
    string[] files = Directory.GetFiles(@"C:\");
    Comparison<string> naturalOrder = CompareNatural;
    Array.Sort(files, naturalOrder);
    

    Here's a wrapper class for use as IComparer<string>:

    /// <summary>
    /// Adapts a <see cref="Comparison{T}"/> delegate to the <see cref="IComparer{T}"/> interface.
    /// </summary>
    /// <typeparam name="T">Type of the objects to compare.</typeparam>
    public class CustomComparer<T> : IComparer<T> {
        // Assigned once in the constructor and never replaced.
        private readonly Comparison<T> _comparison;

        /// <summary>Creates a comparer that forwards every comparison to <paramref name="comparison"/>.</summary>
        /// <param name="comparison">Delegate that performs the comparison.</param>
        /// <exception cref="ArgumentNullException"><paramref name="comparison"/> is null.</exception>
        public CustomComparer(Comparison<T> comparison) {
            // Fail at construction rather than with a NullReferenceException at the first Compare call.
            if (comparison == null) {
                throw new ArgumentNullException(nameof(comparison));
            }
            _comparison = comparison;
        }

        /// <summary>Returns the wrapped delegate's result for <paramref name="x"/> and <paramref name="y"/>.</summary>
        public int Compare(T x, T y) {
            return _comparison(x, y);
        }
    }
    

    Example:

    // Wrap CompareNatural in an IComparer<string> so LINQ's OrderBy can use it.
    IComparer<string> naturalOrder = new CustomComparer<string>(CompareNatural);
    string[] files = Directory.EnumerateFiles(@"C:\")
        .OrderBy(path => path, naturalOrder)
        .ToArray();
    

    Here's a good set of filenames I use for testing:

    // Expands the run-length escapes used in the encoded file names: a backslash followed by a
    // count repeats the character just before the backslash that many additional times.
    // Each leading '0' of the count adds one digit to its width ("\3" -> 3, "\017" -> 17).
    Func<string, string> expand = text =>
    {
        while (true)
        {
            int slash = text.IndexOf('\\');
            if (slash == -1)
            {
                return text;
            }

            int cursor = slash + 1;
            int width = 1;
            while (text[cursor] == '0')
            {
                width++;
                cursor++;
            }

            int repeat = Int32.Parse(text.Substring(cursor, width));
            string head = text.Substring(0, slash);
            string run = new string(text[slash - 1], repeat);
            string tail = text.Substring(cursor + width);
            text = head + run + tail;
        }
    };
    // Base64 text holding the test file names. The code that follows decodes it as UTF-8,
    // replaces '*' with ".txt?", splits on '?' and passes every entry through expand.
    string encodedFileNames =
        "KDEqLW4xMiotbjEzKjAwMDFcMDY2KjAwMlwwMTcqMDA5XDAxNyowMlwwMTcqMDlcMDE3KjEhKjEtISox" +
        "LWEqMS4yNT8xLjI1KjEuNT8xLjUqMSoxXDAxNyoxXDAxOCoxXDAxOSoxXDA2NioxXDA2NyoxYSoyXDAx" +
        "NyoyXDAxOCo5XDAxNyo5XDAxOCo5XDA2Nio9MSphMDAxdGVzdDAxKmEwMDF0ZXN0aW5nYTBcMzEqYTAw" +
        "Mj9hMDAyIGE/YTAwMiBhKmEwMDIqYTAwMmE/YTAwMmEqYTAxdGVzdGluZ2EwMDEqYTAxdnNmcyphMSph" +
        "MWEqYTF6KmEyKmIwMDAzcTYqYjAwM3E0KmIwM3E1KmMtZSpjZCpjZipmIDEqZipnP2cgMT9oLW4qaG8t" +
        "bipJKmljZS1jcmVhbT9pY2VjcmVhbT9pY2VjcmVhbS0/ajBcNDE/ajAwMWE/ajAxP2shKmsnKmstKmsx" +
        "KmthKmxpc3QqbTAwMDNhMDA1YSptMDAzYTAwMDVhKm0wMDNhMDA1Km0wMDNhMDA1YSpuMTIqbjEzKm8t" +
        "bjAxMypvLW4xMipvLW40P28tbjQhP28tbjR6P28tbjlhLWI1Km8tbjlhYjUqb24wMTMqb24xMipvbjQ/" +
        "b240IT9vbjR6P29uOWEtYjUqb245YWI1Km/CrW4wMTMqb8KtbjEyKnAwMCpwMDEqcDAxwr0hKnAwMcK9" +
        "KnAwMcK9YSpwMDHCvcK+KnAwMipwMMK9KnEtbjAxMypxLW4xMipxbjAxMypxbjEyKnItMDAhKnItMDAh" +
        "NSpyLTAwIe+8lSpyLTAwYSpyLe+8kFwxIS01KnIt77yQXDEhLe+8lSpyLe+8kFwxISpyLe+8kFwxITUq" +
        "ci3vvJBcMSHvvJUqci3vvJBcMWEqci3vvJBcMyE1KnIwMCEqcjAwLTUqcjAwLjUqcjAwNSpyMDBhKnIw" +
        "NSpyMDYqcjQqcjUqctmg2aYqctmkKnLZpSpy27Dbtipy27Qqctu1KnLfgN+GKnLfhCpy34UqcuClpuCl" +
        "rCpy4KWqKnLgpasqcuCnpuCnrCpy4KeqKnLgp6sqcuCppuCprCpy4KmqKnLgqasqcuCrpuCrrCpy4Kuq" +
        "KnLgq6sqcuCtpuCtrCpy4K2qKnLgrasqcuCvpuCvrCpy4K+qKnLgr6sqcuCxpuCxrCpy4LGqKnLgsasq" +
        "cuCzpuCzrCpy4LOqKnLgs6sqcuC1puC1rCpy4LWqKnLgtasqcuC5kOC5lipy4LmUKnLguZUqcuC7kOC7" +
        "lipy4LuUKnLgu5UqcuC8oOC8pipy4LykKnLgvKUqcuGBgOGBhipy4YGEKnLhgYUqcuGCkOGClipy4YKU" +
        "KnLhgpUqcuGfoOGfpipy4Z+kKnLhn6UqcuGgkOGglipy4aCUKnLhoJUqcuGlhuGljCpy4aWKKnLhpYsq" +
        "cuGnkOGnlipy4aeUKnLhp5UqcuGtkOGtlipy4a2UKnLhrZUqcuGusOGutipy4a60KnLhrrUqcuGxgOGx" +
        "hipy4bGEKnLhsYUqcuGxkOGxlipy4bGUKnLhsZUqcuqYoFwx6pilKnLqmKDqmKUqcuqYoOqYpipy6pik" +
        "KnLqmKUqcuqjkOqjlipy6qOUKnLqo5UqcuqkgOqkhipy6qSEKnLqpIUqcuqpkOqplipy6qmUKnLqqZUq" +
        "cvCQkqAqcvCQkqUqcvCdn5gqcvCdn50qcu+8kFwxISpy77yQXDEt77yVKnLvvJBcMS7vvJUqcu+8kFwx" +
        "YSpy77yQXDHqmKUqcu+8kFwx77yO77yVKnLvvJBcMe+8lSpy77yQ77yVKnLvvJDvvJYqcu+8lCpy77yV" +
        "KnNpKnPEsSp0ZXN02aIqdGVzdNmi2aAqdGVzdNmjKnVBZS0qdWFlKnViZS0qdUJlKnVjZS0xw6kqdWNl" +
        "McOpLSp1Y2Uxw6kqdWPDqS0xZSp1Y8OpMWUtKnVjw6kxZSp3ZWlhMSp3ZWlhMip3ZWlzczEqd2Vpc3My" +
        "KndlaXoxKndlaXoyKndlacOfMSp3ZWnDnzIqeSBhMyp5IGE0KnknYTMqeSdhNCp5K2EzKnkrYTQqeS1h" +
        "Myp5LWE0KnlhMyp5YTQqej96IDA1MD96IDIxP3ohMjE/ejIwP3oyMj96YTIxP3rCqTIxP1sxKl8xKsKt" +
        "bjEyKsKtbjEzKsSwKg==";
    // Decode the Base64 payload, restore the ".txt" suffixes, split into individual names
    // and expand the run-length escapes in each one.
    byte[] rawNames = Convert.FromBase64String(encodedFileNames);
    string joinedNames = Encoding.UTF8.GetString(rawNames).Replace("*", ".txt?");
    string[] fileNames = joinedNames
        .Split(new[] { "?" }, StringSplitOptions.RemoveEmptyEntries)
        .Select(expand)
        .ToArray();
    
    0 讨论(0)
  • 2020-11-21 05:46

    The easiest thing to do is just P/Invoke the built-in function in Windows, and use it as the comparison function in your IComparer:

    /// <summary>
    /// P/Invoke declaration for <c>StrCmpLogicalW</c> in shlwapi.dll; both strings are
    /// marshalled as Unicode.
    /// </summary>
    /// <param name="psz1">First string to compare.</param>
    /// <param name="psz2">Second string to compare.</param>
    /// <returns>
    /// The native function's result. NOTE(review): presumably negative/zero/positive like a
    /// comparer result - confirm against the Win32 documentation.
    /// </returns>
    [DllImport("shlwapi.dll", CharSet = CharSet.Unicode)]
    private static extern int StrCmpLogicalW(string psz1, string psz2);
    

    Michael Kaplan has some examples of how this function works here, and the changes that were made for Vista to make it work more intuitively. The plus side of this function is that it will have the same behaviour as the version of Windows it runs on, however this does mean that it differs between versions of Windows so you need to consider whether this is a problem for you.

    So a complete implementation would be something like:

    /// <summary>
    /// Holds the P/Invoke declaration used by the natural-order comparers.
    /// </summary>
    [SuppressUnmanagedCodeSecurity]
    internal static class SafeNativeMethods
    {
        /// <summary>
        /// <c>StrCmpLogicalW</c> from shlwapi.dll; both strings are marshalled as Unicode.
        /// NOTE(review): the return value is presumably negative/zero/positive, since the
        /// comparers return it unchanged - confirm against the Win32 documentation.
        /// </summary>
        [DllImport("shlwapi.dll", CharSet = CharSet.Unicode)]
        public static extern int StrCmpLogicalW(string psz1, string psz2);
    }
    
    /// <summary>
    /// Compares strings by delegating to the native <c>StrCmpLogicalW</c> function.
    /// </summary>
    public sealed class NaturalStringComparer : IComparer<string>
    {
        /// <summary>Compares two strings in natural order.</summary>
        /// <param name="a">First string; null sorts before any non-null string.</param>
        /// <param name="b">Second string; null sorts before any non-null string.</param>
        /// <returns>Negative, zero or positive, following the <see cref="IComparer{T}"/> contract.</returns>
        public int Compare(string a, string b)
        {
            // Resolve nulls here, following the IComparer<T> convention, so that a null
            // reference is never handed to native code.
            if (a == null)
            {
                return b == null ? 0 : -1;
            }

            if (b == null)
            {
                return 1;
            }

            return SafeNativeMethods.StrCmpLogicalW(a, b);
        }
    }
    
    /// <summary>
    /// Compares <see cref="FileInfo"/> instances by their <c>Name</c>, delegating to the native
    /// <c>StrCmpLogicalW</c> function.
    /// </summary>
    public sealed class NaturalFileInfoNameComparer : IComparer<FileInfo>
    {
        /// <summary>Compares two files by name in natural order.</summary>
        /// <param name="a">First file; null sorts before any non-null file.</param>
        /// <param name="b">Second file; null sorts before any non-null file.</param>
        /// <returns>Negative, zero or positive, following the <see cref="IComparer{T}"/> contract.</returns>
        public int Compare(FileInfo a, FileInfo b)
        {
            // Guard before reading Name: a null element would otherwise throw a
            // NullReferenceException in the middle of a sort.
            if (a == null)
            {
                return b == null ? 0 : -1;
            }

            if (b == null)
            {
                return 1;
            }

            return SafeNativeMethods.StrCmpLogicalW(a.Name, b.Name);
        }
    }
    
    0 讨论(0)
  • 2020-11-21 05:46

    We had a need for a natural sort to deal with text with the following pattern:

    "Test 1-1-1 something"
    "Test 1-2-3 something"
    ...
    

    For some reason, when I first looked on SO, I didn't find this post and implemented our own. Compared to some of the solutions presented here, while similar in concept, it could have the benefit of maybe being simpler and easier to understand. However, while I did try to look at performance bottlenecks, it is still a much slower implementation than the default OrderBy().

    Here is the extension method I implemented:

    /// <summary>
    /// LINQ-style natural ordering: strings are split into digit and non-digit blocks, and the
    /// blocks are compared pairwise.
    /// </summary>
    public static class EnumerableExtensions
    {
        // set up the regex parser once and for all
        private static readonly Regex Regex = new Regex(@"\d+|\D+", RegexOptions.Compiled | RegexOptions.Singleline);

        // stateless comparer can be built once
        private static readonly AggregateComparer Comparer = new AggregateComparer();

        /// <summary>
        /// Orders <paramref name="source"/> by the natural order of the string that
        /// <paramref name="selector"/> extracts from each element.
        /// </summary>
        /// <typeparam name="T">Element type.</typeparam>
        /// <param name="source">Sequence to order.</param>
        /// <param name="selector">Extracts the sort text; a null result is treated as an empty string.</param>
        /// <returns>A deferred, stably ordered sequence.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="selector"/> is null.</exception>
        public static IEnumerable<T> OrderByNatural<T>(this IEnumerable<T> source, Func<T, string> selector)
        {
            if (source == null)
            {
                throw new ArgumentNullException(nameof(source));
            }

            if (selector == null)
            {
                throw new ArgumentNullException(nameof(selector));
            }

            // first extract string from object using selector
            // then extract digit and non-digit groups
            // ToArray materialises the key so the blocks are built once per element rather than
            // being re-projected on every comparison.
            Func<T, IEnumerable<IComparable>> splitter =
                item => Regex.Matches(selector(item) ?? string.Empty)
                             .Cast<Match>()
                             .Select(m => ToBlock(m.Value))
                             .ToArray();
            return source.OrderBy(splitter, Comparer);
        }

        // Wraps a digit run in a NumberBlock; any other text is used as-is.
        private static IComparable ToBlock(string text)
        {
            if (Char.IsDigit(text[0]))
            {
                return new NumberBlock(text);
            }

            return text;
        }

        /// <summary>
        /// A run of decimal digits compared by numeric value. The value is kept as text, so it
        /// cannot overflow and is not limited to ASCII digits.
        /// </summary>
        private sealed class NumberBlock : IComparable
        {
            // ASCII digits with leading zeros removed (empty for zero).
            private readonly string _significantDigits;

            public NumberBlock(string text)
            {
                Text = text;
                var digits = new char[text.Length];
                for (var i = 0; i < text.Length; i++)
                {
                    digits[i] = (char)('0' + (int)Char.GetNumericValue(text[i]));
                }

                _significantDigits = new string(digits).TrimStart('0');
            }

            /// <summary>The digits exactly as they appeared in the source string.</summary>
            public string Text { get; }

            public int CompareTo(object obj)
            {
                if (obj == null)
                {
                    return 1;
                }

                var other = obj as NumberBlock;
                if (other == null)
                {
                    throw new ArgumentException("Object must be a NumberBlock.", nameof(obj));
                }

                // Without leading zeros, a longer number is larger; equal lengths compare digit by digit.
                var byLength = _significantDigits.Length.CompareTo(other._significantDigits.Length);
                return byLength != 0
                    ? byLength
                    : string.CompareOrdinal(_significantDigits, other._significantDigits);
            }
        }

        /// <summary>
        /// This comparer will compare two lists of objects against each other
        /// </summary>
        /// <remarks>Objects in each list are compared to their corresponding elements in the other
        /// list until a difference is found. If one list runs out first, the shorter list sorts first.</remarks>
        private class AggregateComparer : IComparer<IEnumerable<IComparable>>
        {
            public int Compare(IEnumerable<IComparable> x, IEnumerable<IComparable> y)
            {
                using (var left = x.GetEnumerator())
                using (var right = y.GetEnumerator())
                {
                    while (true)
                    {
                        var hasLeft = left.MoveNext();
                        var hasRight = right.MoveNext();
                        if (!hasLeft || !hasRight)
                        {
                            // false sorts before true: the list that ended first is the smaller one.
                            return hasLeft.CompareTo(hasRight);
                        }

                        var result = CompareBlocks(left.Current, right.Current);
                        if (result != 0)
                        {
                            return result;
                        }
                    }
                }
            }

            // Same kind of block: use its own CompareTo. A number against text: compare the raw
            // text of both, because neither type's CompareTo accepts the other type.
            private static int CompareBlocks(IComparable a, IComparable b)
            {
                var aIsNumber = a is NumberBlock;
                var bIsNumber = b is NumberBlock;
                if (aIsNumber == bIsNumber)
                {
                    return a.CompareTo(b);
                }

                return string.Compare(TextOf(a), TextOf(b), StringComparison.CurrentCulture);
            }

            private static string TextOf(IComparable block)
            {
                var number = block as NumberBlock;
                return number != null ? number.Text : (string)block;
            }
        }
    }
    

    The idea is to split the original strings into blocks of digits and non-digits ("\d+|\D+"). Since this is a potentially expensive task, it is done only once per entry. We then use a comparer of comparable objects (sorry, I can't find a more proper way to say it). It compares each block to its corresponding block in the other string.

    I would like feedback on how this could be improved and what the major flaws are. Note that maintainability is important to us at this point and we are not currently using this in extremely large data sets.

    0 讨论(0)
  • 2020-11-21 05:47

    A version that's easier to read/maintain.

    /// <summary>
    /// Natural-order string comparer. Non-digit characters are compared one at a time with the
    /// current culture; runs of decimal digits are compared first by length, then by value.
    /// </summary>
    /// <remarks>
    /// Leading zeros are significant: a longer digit run always sorts after a shorter one.
    /// A digit sorts before a non-digit at the same position.
    /// </remarks>
    public class NaturalStringComparer : IComparer<string>
    {
        /// <summary>Shared instance; the comparer holds no state.</summary>
        public static NaturalStringComparer Instance { get; } = new NaturalStringComparer();

        /// <summary>Compares two strings in natural order.</summary>
        /// <param name="x">Left string; null sorts before any non-null string.</param>
        /// <param name="y">Right string; null sorts before any non-null string.</param>
        /// <returns>Negative when <paramref name="x"/> sorts first, positive when it sorts last, 0 when equal.</returns>
        public int Compare(string x, string y) {
            const int LeftIsSmaller = -1;
            const int RightIsSmaller = 1;
            const int Equal = 0;

            if (x == null) {
                return y == null ? Equal : LeftIsSmaller;
            }
            if (y == null) {
                return RightIsSmaller;
            }

            var stringComparer = CultureInfo.CurrentCulture.CompareInfo;

            // Both strings are walked with one index: the loop only continues past a number when
            // both digit runs have the same length, so the positions never diverge.
            var index = 0;
            while (index < x.Length && index < y.Length) {
                var leftIsNumber = char.IsDigit(x[index]);
                var rightIsNumber = char.IsDigit(y[index]);

                if (leftIsNumber != rightIsNumber) {
                    return leftIsNumber ? LeftIsSmaller : RightIsSmaller;
                }

                if (!leftIsNumber) {
                    var result = stringComparer.Compare(x, index, 1, y, index, 1);
                    if (result != 0) return result;
                    index++;
                    continue;
                }

                var leftNumberLength = DigitRunLength(x, index);
                var rightNumberLength = DigitRunLength(y, index);
                if (leftNumberLength != rightNumberLength) {
                    return leftNumberLength < rightNumberLength ? LeftIsSmaller : RightIsSmaller;
                }

                // Equal length: the first differing digit decides. Comparing digit by digit avoids
                // parsing, so runs of any length work.
                for (var i = 0; i < leftNumberLength; i++) {
                    var leftDigit = (int)char.GetNumericValue(x[index + i]);
                    var rightDigit = (int)char.GetNumericValue(y[index + i]);
                    if (leftDigit != rightDigit) {
                        return leftDigit < rightDigit ? LeftIsSmaller : RightIsSmaller;
                    }
                }

                index += leftNumberLength;
            }

            if (x.Length < y.Length) {
                return LeftIsSmaller;
            } else if (x.Length > y.Length) {
                return RightIsSmaller;
            }

            return Equal;
        }

        /// <summary>
        /// Measures and parses the run of decimal digits that starts at <paramref name="offset"/>.
        /// </summary>
        /// <param name="str">String containing the number.</param>
        /// <param name="offset">Index of the first digit of the run.</param>
        /// <param name="number">Receives the value of the digit run.</param>
        /// <returns>The number of digits in the run.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="str"/> is null, empty or white space.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="offset"/> is outside the string.</exception>
        /// <exception cref="ArgumentException">The character at <paramref name="offset"/> is not a digit.</exception>
        /// <exception cref="OverflowException">The digit run does not fit in an <see cref="int"/>.</exception>
        public int NumberLength(string str, int offset, out int number) {
            if (string.IsNullOrWhiteSpace(str)) throw new ArgumentNullException(nameof(str));
            if (offset < 0) throw new ArgumentOutOfRangeException(nameof(offset), offset, "Offset must not be negative.");
            if (offset >= str.Length) throw new ArgumentOutOfRangeException(nameof(offset), offset, "Offset must be less than the length of the string.");

            var curChar = str[offset];

            if (!char.IsDigit(curChar))
                throw new ArgumentException($"'{curChar}' is not a number.", nameof(offset));

            var length = DigitRunLength(str, offset);

            // Accumulate the value from the digits themselves, starting with the first one and
            // stopping before the terminating character.
            var value = 0;
            for (var i = 0; i < length; i++) {
                value = checked(value * 10 + (int)char.GetNumericValue(str[offset + i]));
            }

            number = value;
            return length;
        }

        // Counts the consecutive decimal digits in str starting at offset.
        private static int DigitRunLength(string str, int offset) {
            var end = offset;
            while (end < str.Length && char.IsDigit(str[end])) {
                end++;
            }
            return end - offset;
        }
    }
    
    0 讨论(0)
提交回复
热议问题