I have a lot of code in which I do something like this
bool GetIsUnique<T>(IEnumerable<T> values)
{
    return values.Count() == values.Distinct().Count();
}
I'm surprised no one has tested this yet:
Comparing Gluip's version from the question with the answers from LukeH, Jamiec, and MrKWatkins: all three answers are better than the asker's version.
The three answers are pretty comparable to one another, but in most cases Jamiec's is slightly faster.
When the data has no duplicates, or the duplicate is at the end of the IEnumerable, neither the set size nor the choice of algorithm makes a major difference.
When the data has a duplicate near the beginning or the middle, Gluip's version from the original question shows its slowness compared to the other three: the HashSet-based checks return as soon as they hit the first duplicate, while Count() == Distinct().Count() always enumerates the whole sequence twice.
The time to check a set appears to scale linearly with the size of the set, so no algorithm is preferable specifically for large or small sets.
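For reference, the single-pass check that the three faster answers share boils down to the sketch below (essentially LukeH's shape; the exact versions being compared are in the test program that follows):

// Minimal sketch: returns false as soon as the first duplicate is seen,
// instead of enumerating the sequence twice like Count() == Distinct().Count().
static bool GetIsUnique<T>(IEnumerable<T> values)
{
    var seen = new HashSet<T>();
    foreach (var value in values)
    {
        if (!seen.Add(value)) // HashSet<T>.Add returns false for a duplicate
            return false;
    }
    return true;
}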
Here's a test program that spits out a CSV that you can load into a spreadsheet program and sort and graph if you like:
Test Program:
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace AreUniqueTest
{
    class Program
    {
        const int Iterations = 1000;

        enum DupeLocation
        {
            None,
            Early,
            Center,
            Late,
        }

        enum SetSize
        {
            Tiny = 10,
            Small = 100,
            Medium = 1000,
            Large = 10000,
            Huge = 100000,
        }

        static void Main()
        {
            Dictionary<string, Func<IEnumerable<int>, bool>> functions = new Dictionary<string, Func<IEnumerable<int>, bool>>
            {
                {"Gluip", GetIsUniqueGluip},
                {"LukeH", GetIsUniqueLukeH},
                {"Jamiec", GetIsUniqueJamiec},
                {"MrKWatkins", GetIsUniqueMrKWatkins}
            };

            var output = new StringBuilder();
            Console.WriteLine("Function,SetSize,DupeLocation,TotalTicks,AverageTicks");
            output.AppendLine("Function,SetSize,DupeLocation,TotalTicks,AverageTicks");

            foreach (SetSize size in Enum.GetValues(typeof(SetSize)))
            {
                var sizevalue = (int)size;
                foreach (DupeLocation location in Enum.GetValues(typeof(DupeLocation)))
                {
                    var data = CreateTestData((int)size, location);
                    foreach (string functionKey in functions.Keys)
                    {
                        var ticks = RunSet(functions[functionKey], data, Iterations);
                        var avg = ticks / Iterations;
                        Console.WriteLine($"{functionKey},{sizevalue},{location},{ticks},{avg}");
                        output.AppendLine($"{functionKey},{sizevalue},{location},{ticks},{avg}");
                    }
                }
            }

            File.WriteAllText("output.csv", output.ToString());
            Process.Start("output.csv");
        }

        static long RunSet<T>(Func<IEnumerable<T>, bool> getIsUnique, IEnumerable<T> values, int iterations)
        {
            var stopwatch = new Stopwatch();
            stopwatch.Start();
            for (var i = 0; i < iterations; i++)
            {
                getIsUnique.Invoke(values);
            }
            stopwatch.Stop();
            return stopwatch.ElapsedTicks;
        }

        static bool GetIsUniqueLukeH<T>(IEnumerable<T> values)
        {
            var set = new HashSet<T>();
            foreach (T item in values)
            {
                if (!set.Add(item))
                    return false;
            }
            return true;
        }

        static bool GetIsUniqueGluip<T>(IEnumerable<T> values)
        {
            return values.Count() == values.Distinct().Count();
        }

        static bool GetIsUniqueJamiec<T>(IEnumerable<T> list)
        {
            var hs = new HashSet<T>();
            return list.All(hs.Add);
        }

        static bool GetIsUniqueMrKWatkins<T>(IEnumerable<T> values)
        {
            HashSet<T> hashSet = new HashSet<T>();
            foreach (var value in values)
            {
                if (hashSet.Contains(value))
                {
                    return false;
                }
                hashSet.Add(value);
            }
            return true;
        }

        static int[] CreateTestData(int size, DupeLocation location)
        {
            var result = new int[size];
            Parallel.For(0, size, i => { result[i] = i; });
            return SetDupe(result, location);
        }

        static int[] SetDupe(int[] values, DupeLocation location)
        {
            switch (location)
            {
                case DupeLocation.Early:
                    values[1] = values[0];
                    break;
                case DupeLocation.Center:
                    var midpoint = values.Length / 2;
                    values[midpoint] = values[midpoint + 1];
                    break;
                case DupeLocation.Late:
                    values[values.Length - 1] = values[values.Length - 2];
                    break;
                // case DupeLocation.None: do nothing.
            }
            return values;
        }
    }
}
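If you want a quick sanity check that the implementations agree before trusting the timings, something like this can be dropped into Main. The sample arrays are hypothetical and not part of the benchmark above.

// Hypothetical spot check: every implementation should return the same answers.
var unique = new[] { 1, 2, 3, 4 };
var withDupe = new[] { 1, 2, 2, 3 };
Console.WriteLine(GetIsUniqueJamiec(unique));       // True
Console.WriteLine(GetIsUniqueJamiec(withDupe));     // False
Console.WriteLine(GetIsUniqueMrKWatkins(withDupe)); // False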