LINQ - Full Outer Join

后端 未结 16 1564
既然无缘
既然无缘 2020-11-21 22:45

I have a list of people's IDs and their first names, and a list of people's IDs and their surnames. Some people don't have a first name and some don't have a surname; I'd like to perform a full outer join on the two lists.

相关标签:
16条回答
  • 2020-11-21 23:19

    I think there are problems with most of these, including the accepted answer, because they don't work well with LINQ over IQueryable: they either make too many server round trips and return too much data, or do too much of the work on the client.

    For IEnumerable I don't like Sehe's answer or similar because it has excessive memory use (a simple 10000000 two list test ran Linqpad out of memory on my 32GB machine).

    Also, most of the others don't actually implement a proper Full Outer Join because they are using a Union with a Right Join instead of Concat with a Right Anti Semi Join, which not only eliminates the duplicate inner join rows from the result, but any proper duplicates that existed originally in the left or right data.

    So here are my extensions that handle all of these issues: they generate SQL and implement the join directly in LINQ to SQL so that it executes on the server, and on Enumerables they are faster and use less memory than the other approaches:

    /// <summary>
    /// Outer-join extension methods for <see cref="IEnumerable{T}"/> (LINQ to Objects) and
    /// <see cref="IQueryable{T}"/> (expression-tree based providers).
    /// </summary>
    /// <remarks>
    /// The full outer join is composed as "left outer join" concatenated with "right anti semi join"
    /// (right rows that have no left match), so rows that were duplicated in the inputs are kept.
    /// The <c>Distinct</c> variants use <c>Union</c> of a left and a right outer join instead, which
    /// removes duplicate result rows.
    /// </remarks>
    public static class Ext {
        /// <summary>
        /// Left outer join: every left item is paired with each matching right item, or with
        /// <c>default(TRight)</c> when no right item has the same key.
        /// </summary>
        /// <param name="leftItems">Outer (preserved) sequence.</param>
        /// <param name="rightItems">Inner sequence.</param>
        /// <param name="leftKeySelector">Extracts the join key from a left item.</param>
        /// <param name="rightKeySelector">Extracts the join key from a right item.</param>
        /// <param name="resultSelector">Builds a result row from a left item and a (possibly default) right item.</param>
        /// <returns>A deferred sequence of result rows.</returns>
        public static IEnumerable<TResult> LeftOuterJoin<TLeft, TRight, TKey, TResult>(
            this IEnumerable<TLeft> leftItems,
            IEnumerable<TRight> rightItems,
            Func<TLeft, TKey> leftKeySelector,
            Func<TRight, TKey> rightKeySelector,
            Func<TLeft, TRight, TResult> resultSelector) {

            return from left in leftItems
                   join right in rightItems on leftKeySelector(left) equals rightKeySelector(right) into temp
                   from right in temp.DefaultIfEmpty()
                   select resultSelector(left, right);
        }

        /// <summary>
        /// Right outer join: every right item is paired with each matching left item, or with
        /// <c>default(TLeft)</c> when no left item has the same key.
        /// </summary>
        /// <param name="leftItems">Inner sequence.</param>
        /// <param name="rightItems">Outer (preserved) sequence.</param>
        /// <param name="leftKeySelector">Extracts the join key from a left item.</param>
        /// <param name="rightKeySelector">Extracts the join key from a right item.</param>
        /// <param name="resultSelector">Builds a result row from a (possibly default) left item and a right item.</param>
        /// <returns>A deferred sequence of result rows.</returns>
        public static IEnumerable<TResult> RightOuterJoin<TLeft, TRight, TKey, TResult>(
            this IEnumerable<TLeft> leftItems,
            IEnumerable<TRight> rightItems,
            Func<TLeft, TKey> leftKeySelector,
            Func<TRight, TKey> rightKeySelector,
            Func<TLeft, TRight, TResult> resultSelector) {

            return from right in rightItems
                   join left in leftItems on rightKeySelector(right) equals leftKeySelector(left) into temp
                   from left in temp.DefaultIfEmpty()
                   select resultSelector(left, right);
        }

        /// <summary>
        /// Full outer join built as the <c>Union</c> of the left and right outer joins.
        /// Because <c>Union</c> is a set operation, equal result rows are collapsed into one.
        /// </summary>
        /// <returns>A deferred sequence of distinct result rows.</returns>
        public static IEnumerable<TResult> FullOuterJoinDistinct<TLeft, TRight, TKey, TResult>(
            this IEnumerable<TLeft> leftItems,
            IEnumerable<TRight> rightItems,
            Func<TLeft, TKey> leftKeySelector,
            Func<TRight, TKey> rightKeySelector,
            Func<TLeft, TRight, TResult> resultSelector) {

            return leftItems.LeftOuterJoin(rightItems, leftKeySelector, rightKeySelector, resultSelector).Union(leftItems.RightOuterJoin(rightItems, leftKeySelector, rightKeySelector, resultSelector));
        }

        /// <summary>
        /// Right anti semi join: returns one row for every right item whose key does not occur among
        /// the left keys; the left argument passed to <paramref name="resultSelector"/> is always
        /// <c>default(TLeft)</c>.
        /// </summary>
        /// <param name="leftItems">Sequence whose keys exclude right items.</param>
        /// <param name="rightItems">Sequence that is filtered and projected.</param>
        /// <param name="leftKeySelector">Extracts the join key from a left item.</param>
        /// <param name="rightKeySelector">Extracts the join key from a right item.</param>
        /// <param name="resultSelector">Builds a result row from <c>default(TLeft)</c> and a right item.</param>
        /// <returns>A deferred sequence; the left keys are collected when enumeration starts.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="leftItems"/> or <paramref name="rightItems"/> is null.</exception>
        public static IEnumerable<TResult> RightAntiSemiJoin<TLeft, TRight, TKey, TResult>(
            this IEnumerable<TLeft> leftItems,
            IEnumerable<TRight> rightItems,
            Func<TLeft, TKey> leftKeySelector,
            Func<TRight, TKey> rightKeySelector,
            Func<TLeft, TRight, TResult> resultSelector) {

            // Check the sources here so a null source fails at the call site, not at enumeration.
            if (leftItems == null) {
                throw new ArgumentNullException(nameof(leftItems));
            }
            if (rightItems == null) {
                throw new ArgumentNullException(nameof(rightItems));
            }

            return RightAntiSemiJoinIterator(leftItems, rightItems, leftKeySelector, rightKeySelector, resultSelector);
        }

        // Iterator half of RightAntiSemiJoin. The left-key set is built on enumeration rather than
        // at call time, so this stays consistent with the deferred LeftOuterJoin that FullOuterJoin
        // concatenates it with: both halves observe the same state of leftItems.
        private static IEnumerable<TResult> RightAntiSemiJoinIterator<TLeft, TRight, TKey, TResult>(
            IEnumerable<TLeft> leftItems,
            IEnumerable<TRight> rightItems,
            Func<TLeft, TKey> leftKeySelector,
            Func<TRight, TKey> rightKeySelector,
            Func<TLeft, TRight, TResult> resultSelector) {

            var hashLK = new HashSet<TKey>(from l in leftItems select leftKeySelector(l));
            foreach (var r in rightItems) {
                if (!hashLK.Contains(rightKeySelector(r))) {
                    yield return resultSelector(default(TLeft), r);
                }
            }
        }

        /// <summary>
        /// Full outer join: the left outer join followed (via <c>Concat</c>) by the right items that
        /// have no left match. No de-duplication is performed.
        /// </summary>
        /// <param name="leftItems">Left sequence.</param>
        /// <param name="rightItems">Right sequence.</param>
        /// <param name="leftKeySelector">Extracts the join key from a left item.</param>
        /// <param name="rightKeySelector">Extracts the join key from a right item.</param>
        /// <param name="resultSelector">Builds a result row; either argument may be the type's default when unmatched.</param>
        /// <returns>A deferred sequence of result rows.</returns>
        public static IEnumerable<TResult> FullOuterJoin<TLeft, TRight, TKey, TResult>(
            this IEnumerable<TLeft> leftItems,
            IEnumerable<TRight> rightItems,
            Func<TLeft, TKey> leftKeySelector,
            Func<TRight, TKey> rightKeySelector,
            Func<TLeft, TRight, TResult> resultSelector)  where TLeft : class {

            return leftItems.LeftOuterJoin(rightItems, leftKeySelector, rightKeySelector, resultSelector).Concat(leftItems.RightAntiSemiJoin(rightItems, leftKeySelector, rightKeySelector, resultSelector));
        }

        // "Cast by example": the unused parameters exist only so the compiler can infer TP/TC/TResult
        // when TP is an anonymous type that cannot be named in a cast expression.
        private static Expression<Func<TP, TC, TResult>> CastSMBody<TP, TC, TResult>(LambdaExpression ex, TP unusedP, TC unusedC, TResult unusedRes) => (Expression<Func<TP, TC, TResult>>)ex;

        /// <summary>
        /// Left outer join over <see cref="IQueryable{T}"/>, expressed as
        /// <c>GroupJoin(...).SelectMany(g =&gt; g.rightg.DefaultIfEmpty(), ...)</c>.
        /// </summary>
        /// <remarks>
        /// The result selector handed to <c>SelectMany</c> is built by hand as
        /// <c>(p, c) =&gt; resultSelector(p.left, c)</c> using <c>Expression.Invoke</c>.
        /// NOTE(review): whether a given query provider can translate the invocation node is not
        /// visible here - confirm against the provider in use.
        /// </remarks>
        public static IQueryable<TResult> LeftOuterJoin<TLeft, TRight, TKey, TResult>(
            this IQueryable<TLeft> leftItems,
            IQueryable<TRight> rightItems,
            Expression<Func<TLeft, TKey>> leftKeySelector,
            Expression<Func<TRight, TKey>> rightKeySelector,
            Expression<Func<TLeft, TRight, TResult>> resultSelector) {

            // Sample instance of the anonymous type produced by the GroupJoin below; used only for its type.
            var sampleAnonLR = new { left = default(TLeft), rightg = default(IEnumerable<TRight>) };
            var parmP = Expression.Parameter(sampleAnonLR.GetType(), "p");
            var parmC = Expression.Parameter(typeof(TRight), "c");
            var argLeft = Expression.PropertyOrField(parmP, "left");
            var newleftrs = CastSMBody(Expression.Lambda(Expression.Invoke(resultSelector, argLeft, parmC), parmP, parmC), sampleAnonLR, default(TRight), default(TResult));

            // leftItems is already an IQueryable<TLeft>, so no AsQueryable() conversion is needed.
            return leftItems.GroupJoin(rightItems, leftKeySelector, rightKeySelector, (left, rightg) => new { left, rightg }).SelectMany(r => r.rightg.DefaultIfEmpty(), newleftrs);
        }

        /// <summary>
        /// Right outer join over <see cref="IQueryable{T}"/>: a group join driven from
        /// <paramref name="rightItems"/>, flattened with <c>DefaultIfEmpty</c>.
        /// </summary>
        /// <remarks>
        /// The hand-built selector is <c>(p, c) =&gt; resultSelector(c, p.right)</c>, so the caller's
        /// selector still receives (left, right) in that order.
        /// </remarks>
        public static IQueryable<TResult> RightOuterJoin<TLeft, TRight, TKey, TResult>(
            this IQueryable<TLeft> leftItems,
            IQueryable<TRight> rightItems,
            Expression<Func<TLeft, TKey>> leftKeySelector,
            Expression<Func<TRight, TKey>> rightKeySelector,
            Expression<Func<TLeft, TRight, TResult>> resultSelector) {

            // Sample instance of the anonymous type produced by the GroupJoin below; used only for its type.
            var sampleAnonLR = new { leftg = default(IEnumerable<TLeft>), right = default(TRight) };
            var parmP = Expression.Parameter(sampleAnonLR.GetType(), "p");
            var parmC = Expression.Parameter(typeof(TLeft), "c");
            var argRight = Expression.PropertyOrField(parmP, "right");
            var newrightrs = CastSMBody(Expression.Lambda(Expression.Invoke(resultSelector, parmC, argRight), parmP, parmC), sampleAnonLR, default(TLeft), default(TResult));

            return rightItems.GroupJoin(leftItems, rightKeySelector, leftKeySelector, (right, leftg) => new { leftg, right }).SelectMany(l => l.leftg.DefaultIfEmpty(), newrightrs);
        }

        /// <summary>
        /// Full outer join over <see cref="IQueryable{T}"/> built as the <c>Union</c> of the left and
        /// right outer joins; as a set operation, <c>Union</c> collapses equal result rows.
        /// </summary>
        public static IQueryable<TResult> FullOuterJoinDistinct<TLeft, TRight, TKey, TResult>(
            this IQueryable<TLeft> leftItems,
            IQueryable<TRight> rightItems,
            Expression<Func<TLeft, TKey>> leftKeySelector,
            Expression<Func<TRight, TKey>> rightKeySelector,
            Expression<Func<TLeft, TRight, TResult>> resultSelector) {

            return leftItems.LeftOuterJoin(rightItems, leftKeySelector, rightKeySelector, resultSelector).Union(leftItems.RightOuterJoin(rightItems, leftKeySelector, rightKeySelector, resultSelector));
        }

        // Single-parameter counterpart of CastSMBody: the unused parameters only drive type inference
        // for the anonymous TP.
        private static Expression<Func<TP, TResult>> CastSBody<TP, TResult>(LambdaExpression ex, TP unusedP, TResult unusedRes) => (Expression<Func<TP, TResult>>)ex;

        /// <summary>
        /// Right anti semi join over <see cref="IQueryable{T}"/>: right items whose group of matching
        /// left items is empty, projected with a constant <c>default(TLeft)</c> as the left argument.
        /// </summary>
        public static IQueryable<TResult> RightAntiSemiJoin<TLeft, TRight, TKey, TResult>(
            this IQueryable<TLeft> leftItems,
            IQueryable<TRight> rightItems,
            Expression<Func<TLeft, TKey>> leftKeySelector,
            Expression<Func<TRight, TKey>> rightKeySelector,
            Expression<Func<TLeft, TRight, TResult>> resultSelector) {

            // Sample instance of the anonymous type produced by the GroupJoin below; used only for its type.
            var sampleAnonLgR = new { leftg = default(IEnumerable<TLeft>), right = default(TRight) };
            var parmLgR = Expression.Parameter(sampleAnonLgR.GetType(), "lgr");
            // The type is passed explicitly because a null constant carries no type information of its own.
            var argLeft = Expression.Constant(default(TLeft), typeof(TLeft));
            var argRight = Expression.PropertyOrField(parmLgR, "right");
            var newrightrs = CastSBody(Expression.Lambda(Expression.Invoke(resultSelector, argLeft, argRight), parmLgR), sampleAnonLgR, default(TResult));

            return rightItems.GroupJoin(leftItems, rightKeySelector, leftKeySelector, (right, leftg) => new { leftg, right }).Where(lgr => !lgr.leftg.Any()).Select(newrightrs);
        }

        /// <summary>
        /// Full outer join over <see cref="IQueryable{T}"/>: the left outer join concatenated with the
        /// right anti semi join. No de-duplication is performed.
        /// </summary>
        public static IQueryable<TResult> FullOuterJoin<TLeft, TRight, TKey, TResult>(
            this IQueryable<TLeft> leftItems,
            IQueryable<TRight> rightItems,
            Expression<Func<TLeft, TKey>> leftKeySelector,
            Expression<Func<TRight, TKey>> rightKeySelector,
            Expression<Func<TLeft, TRight, TResult>> resultSelector) {

            return leftItems.LeftOuterJoin(rightItems, leftKeySelector, rightKeySelector, resultSelector).Concat(leftItems.RightAntiSemiJoin(rightItems, leftKeySelector, rightKeySelector, resultSelector));
        }
    }
    

    The difference between a Right Anti-Semi-Join and a Right Outer Join is mostly moot with Linq to Objects or in the source, but it makes a difference on the server (SQL) side in the final answer, removing an unnecessary JOIN.

    The hand coding of Expression to handle merging an Expression<Func<>> into a lambda could be improved with LinqKit, but it would be nice if the language/compiler had added some help for that. The FullOuterJoinDistinct and RightOuterJoin functions are included for completeness, but I did not re-implement FullOuterGroupJoin yet.

    I wrote another version of a full outer join for IEnumerable for cases where the key is orderable, which is about 50% faster than combining the left outer join with the right anti semi join, at least on small collections. It goes through each collection after sorting just once.

    I also added another answer for a version that works with EF by replacing the Invoke with a custom expansion.

    0 讨论(0)
  • 2020-11-21 23:19

    I really hate these linq expressions, this is why SQL exists:

    -- Full outer join of first names and surnames on id; the id is taken from
    -- whichever side is present.
    SELECT ISNULL(fn.id, ln.id) AS id,
           fn.firstname,
           ln.lastname
    FROM firstnames AS fn
    FULL OUTER JOIN lastnames AS ln
        ON fn.id = ln.id
    

    Create this as sql view in database and import it as entity.

    Of course, (distinct) union of left and right joins will make it too, but it is stupid.

    0 讨论(0)
  • 2020-11-21 23:22

    Update 1: providing a truly generalized extension method FullOuterJoin
    Update 2: optionally accepting a custom IEqualityComparer for the key type
    Update 3: this implementation has recently become part of MoreLinq - Thanks guys!

    Edit Added FullOuterGroupJoin (ideone). I reused the GetOuter<> implementation, making this a fraction less performant than it could be, but I'm aiming for 'highlevel' code, not bleeding-edge optimized, right now.

    See it live on http://ideone.com/O36nWc

    // Demo: full outer join of first names and surnames on id, printing one line per joined row.
    static void Main(string[] args)
    {
        var ax = new[]
        {
            new { id = 1, name = "John" },
            new { id = 2, name = "Sue" },
        };
        var bx = new[]
        {
            new { id = 1, surname = "Doe" },
            new { id = 3, surname = "Smith" },
        };

        // The projection also receives the join key (third argument); it is unused here.
        var joined = ax.FullOuterJoin(bx, a => a.id, b => b.id, (a, b, id) => new { a, b });

        foreach (var row in joined.ToList())
        {
            Console.WriteLine(row);
        }
    }
    

    Prints the output:

    { a = { id = 1, name = John }, b = { id = 1, surname = Doe } }
    { a = { id = 2, name = Sue }, b =  }
    { a = , b = { id = 3, surname = Smith } }
    

    You could also supply defaults: http://ideone.com/kG4kqO

        // Same join as above, but with explicit stand-in values: the two trailing arguments
        // are used in place of a missing `a` or `b` element respectively, so the projection
        // can read a.name / b.surname without a null check.
        ax.FullOuterJoin(
                bx, a => a.id, b => b.id, 
                (a, b, id) => new { a.name, b.surname },
                new { id = -1, name    = "(no firstname)" },
                new { id = -2, surname = "(no surname)" }
            )
    

    Printing:

    { name = John, surname = Doe }
    { name = Sue, surname = (no surname) }
    { name = (no firstname), surname = Smith }
    

    Explanation of terms used:

    Joining is a term borrowed from relational database design:

    • A join will repeat elements from a as many times as there are elements in b with corresponding key (i.e.: nothing if b were empty). Database lingo calls this inner (equi)join.
    • An outer join includes elements from a for which no corresponding element exists in b. (i.e.: even results if b were empty). This is usually referred to as left join.
    • A full outer join includes records from a as well as b if no corresponding element exists in the other. (i.e. even results if a were empty)

    Something not usually seen in RDBMS is a group join[1]:

    • A group join, does the same as described above, but instead of repeating elements from a for multiple corresponding b, it groups the records with corresponding keys. This is often more convenient when you wish to enumerate through 'joined' records, based on a common key.

    See also GroupJoin which contains some general background explanations as well.


    [1] (I believe Oracle and MSSQL have proprietary extensions for this)

    Full code

    A generalized 'drop-in' Extension class for this

    /// <summary>
    /// Full outer join helpers for in-memory sequences, implemented with lookups keyed by the
    /// join key. Both inputs are read into lookups when the method is called; the projection
    /// itself runs lazily as the result is enumerated.
    /// </summary>
    internal static class MyExtensions
    {
        /// <summary>
        /// Full outer group join: for every key present in either input, calls
        /// <paramref name="projection"/> once with all <paramref name="a"/> elements and all
        /// <paramref name="b"/> elements having that key (either group may be empty).
        /// </summary>
        /// <param name="a">First sequence.</param>
        /// <param name="b">Second sequence.</param>
        /// <param name="selectKeyA">Extracts the key from an element of <paramref name="a"/>.</param>
        /// <param name="selectKeyB">Extracts the key from an element of <paramref name="b"/>.</param>
        /// <param name="projection">Builds a result from the two groups and their key.</param>
        /// <param name="cmp">Key comparer; the default comparer for <typeparamref name="TKey"/> when null.</param>
        /// <returns>One result per distinct key.</returns>
        internal static IEnumerable<TResult> FullOuterGroupJoin<TA, TB, TKey, TResult>(
            this IEnumerable<TA> a,
            IEnumerable<TB> b,
            Func<TA, TKey> selectKeyA,
            Func<TB, TKey> selectKeyB,
            Func<IEnumerable<TA>, IEnumerable<TB>, TKey, TResult> projection,
            IEqualityComparer<TKey> cmp = null)
        {
            var comparer = cmp ?? EqualityComparer<TKey>.Default;
            var groupsA = a.ToLookup(selectKeyA, comparer);
            var groupsB = b.ToLookup(selectKeyB, comparer);
            var allKeys = CollectKeys(groupsA, groupsB, comparer);

            // Indexing a lookup with an absent key yields an empty group, so no existence check is needed.
            return allKeys.Select(key => projection(groupsA[key], groupsB[key], key));
        }

        /// <summary>
        /// Full outer join: for every key present in either input, yields one result per
        /// (a-element, b-element) pair with that key. When one side has no element for the key,
        /// <paramref name="defaultA"/> or <paramref name="defaultB"/> stands in for it.
        /// </summary>
        /// <param name="a">First sequence.</param>
        /// <param name="b">Second sequence.</param>
        /// <param name="selectKeyA">Extracts the key from an element of <paramref name="a"/>.</param>
        /// <param name="selectKeyB">Extracts the key from an element of <paramref name="b"/>.</param>
        /// <param name="projection">Builds a result from the paired elements and their key.</param>
        /// <param name="defaultA">Substitute for a missing <paramref name="a"/> element.</param>
        /// <param name="defaultB">Substitute for a missing <paramref name="b"/> element.</param>
        /// <param name="cmp">Key comparer; the default comparer for <typeparamref name="TKey"/> when null.</param>
        /// <returns>The joined results, grouped by key.</returns>
        internal static IEnumerable<TResult> FullOuterJoin<TA, TB, TKey, TResult>(
            this IEnumerable<TA> a,
            IEnumerable<TB> b,
            Func<TA, TKey> selectKeyA,
            Func<TB, TKey> selectKeyB,
            Func<TA, TB, TKey, TResult> projection,
            TA defaultA = default(TA),
            TB defaultB = default(TB),
            IEqualityComparer<TKey> cmp = null)
        {
            var comparer = cmp ?? EqualityComparer<TKey>.Default;
            var groupsA = a.ToLookup(selectKeyA, comparer);
            var groupsB = b.ToLookup(selectKeyB, comparer);
            var allKeys = CollectKeys(groupsA, groupsB, comparer);

            // For each key: cross product of the two groups, each padded with its default when empty.
            return allKeys.SelectMany(key =>
                groupsA[key].DefaultIfEmpty(defaultA).SelectMany(
                    xa => groupsB[key].DefaultIfEmpty(defaultB),
                    (xa, xb) => projection(xa, xb, key)));
        }

        // Distinct keys of both lookups: keys of the first lookup, then unseen keys of the second.
        private static HashSet<TKey> CollectKeys<TA, TB, TKey>(
            ILookup<TKey, TA> first,
            ILookup<TKey, TB> second,
            IEqualityComparer<TKey> comparer)
        {
            var keys = new HashSet<TKey>(comparer);
            foreach (var group in first)
            {
                keys.Add(group.Key);
            }
            foreach (var group in second)
            {
                keys.Add(group.Key);
            }
            return keys;
        }
    }
    
    0 讨论(0)
  • 2020-11-21 23:23

    My clean solution for the situation where the key is unique in both enumerables:

     /// <summary>
     /// Full outer join for inputs in which every key occurs at most once per sequence.
     /// Yields one result per distinct key; the side that has no element for a key is passed
     /// to <paramref name="selector"/> as that type's default value.
     /// </summary>
     /// <param name="a">First sequence.</param>
     /// <param name="b">Second sequence.</param>
     /// <param name="key_a">Extracts the key from an element of <paramref name="a"/>.</param>
     /// <param name="key_b">Extracts the key from an element of <paramref name="b"/>.</param>
     /// <param name="selector">Builds a result from the paired elements.</param>
     /// <returns>Results for the keys of <paramref name="a"/>, then for keys found only in <paramref name="b"/>.</returns>
     private static IEnumerable<TResult> FullOuterJoin<Ta, Tb, TKey, TResult>(
                IEnumerable<Ta> a, IEnumerable<Tb> b,
                Func<Ta, TKey> key_a, Func<Tb, TKey> key_b,
                Func<Ta, Tb, TResult> selector)
            {
                var leftByKey = a.ToLookup(key_a);
                var rightByKey = b.ToLookup(key_b);

                var allKeys = new HashSet<TKey>(leftByKey.Select(group => group.Key));
                allKeys.UnionWith(rightByKey.Select(group => group.Key));

                // Only the first element per key is used; any further elements with the same key are ignored.
                return from key in allKeys
                       let leftMatch = leftByKey[key].FirstOrDefault()
                       let rightMatch = rightByKey[key].FirstOrDefault()
                       select selector(leftMatch, rightMatch);
            }
    

    so

        // Sample inputs: id 1 is present on both sides, id 2 only has a first name, id 3 only a last name.
        var ax = new[]
        {
            new { id = 1, first_name = "ali" },
            new { id = 2, first_name = "mohammad" },
        };
        var bx = new[]
        {
            new { id = 1, last_name = "rezaei" },
            new { id = 3, last_name = "kazemi" },
        };

        // ?. guards against the side that is missing for a given id.
        var list = FullOuterJoin(
            ax,
            bx,
            a => a.id,
            b => b.id,
            (a, b) => $"f: {a?.first_name} l: {b?.last_name}").ToArray();
    

    outputs:

    f: ali l: rezaei
    f: mohammad l:
    f:  l: kazemi
    
    0 讨论(0)
提交回复
热议问题