I think that this works fairly well.
// Input to analyse; replace the literal with the full text.
var text = @"The green algae (singular: green alga) are ..."; // include all your text

// Punctuation and digit characters to strip from every token, held as
// one-character strings so they can be passed to String.Replace.
var remove = "().,:[]0123456789".Select(x => x.ToString()).ToArray();

// Split on whitespace, strip the unwanted characters, lower-case the result
// and drop tokens that end up empty (e.g. "..." once the dots are removed).
var words =
    Regex
        .Matches(text, @"(\S+)")
        // Cast needs an explicit element type: it cannot be inferred from a
        // non-generic IEnumerable source.
        .Cast<Match>()
        // Each match of (\S+) is exactly one whitespace-delimited token, so
        // Match.Value is all that is needed.
        .Select(x => remove.Aggregate(x.Value, (t, r) => t.Replace(r, "")))
        .Select(x => x.Trim().ToLowerInvariant())
        .Where(x => !String.IsNullOrWhiteSpace(x))
        .ToArray();

// Every contiguous run of words: n1 is the start index and n2 the run length,
// giving words.Length * (words.Length + 1) / 2 groups in total.
var groups =
    from n1 in Enumerable.Range(0, words.Length)
    from n2 in Enumerable.Range(1, words.Length - n1)
    select String.Join(" ", words, n1, n2);

// Count identical groups, then order by descending count, then by number of
// words in the group (counted via its spaces), then alphabetically.
var frequencies =
    groups
        .GroupBy(x => x)
        .Select(x => new { wordgroup = x.Key, count = x.Count() })
        .OrderByDescending(x => x.count)
        .ThenBy(x => x.wordgroup.Count(y => y == ' '))
        .ThenBy(x => x.wordgroup)
        .ToArray();
This gives the frequency of every word group, i.e. every contiguous sequence of words in the text, from single words up to the one group that contains all of the words.
The number of words is 288. The total number of word groups is 288 × (288 + 1) / 2 = 41,616. The final number of word groups (after grouping duplicate word groups and removing empty/whitespace strings) is 41,449.
Here are the first 100 of these 41,449:
20 x "the", 13 x "and", 12 x "algae", 12 x "in", 11 x "green", 10 x "of", 9 x "green algae", 8 x "are", 6 x "as", 6 x "species", 5 x "a", 4 x "is", 4 x "or", 4 x "to", 3 x "embryophytes", 3 x "form", 3 x "found", 3 x "lichens", 3 x "live", 3 x "on", 3 x "plants", 3 x "that", 3 x "algae and", 3 x "and in", 3 x "as the", 3 x "in the", 3 x "of the", 2 x "alga", 2 x "can", 2 x "clade", 2 x "class", 2 x "colonial", 2 x "filamentous", 2 x "from", 2 x "higher", 2 x "macroscopic", 2 x "most", 2 x "other", 2 x "seaweeds", 2 x "their", 2 x "trentepohlia", 2 x "while", 2 x "with", 2 x "algae are", 2 x "are a", 2 x "green alga", 2 x "higher plants", 2 x "in lichens", 2 x "of green", 2 x "species of", 2 x "the clade", 2 x "the green", 2 x "green algae and", 2 x "green algae are", 2 x "of green algae", 2 x "species of green", 2 x "the green algae", 2 x "species of green algae", 1 x "about", 1 x "acquired", 1 x "algal", 1 x "also", 1 x "associations", 1 x "bark", 1 x "be", 1 x "both", 1 x "cannot", 1 x "cell", 1 x "cells", 1 x "cellular", 1 x "charales", 1 x "charophyte", 1 x "charophytes", 1 x "chlorarachniophytes", 1 x "chlorophyte", 1 x "chloroplasts", 1 x "ciliate", 1 x "closest", 1 x "coccoid", 1 x "coenobia", 1 x "colonies", 1 x "conduct", 1 x "consisting", 1 x "differentiated", 1 x "differentiation", 1 x "divisions", 1 x "emerged", 1 x "euglenids", 1 x "excluded", 1 x "family", 1 x "few", 1 x "filaments", 1 x "flagella", 1 x "flagellates", 1 x "flatworms", 1 x "for", 1 x "forms", 1 x "full", 1 x "fungal", 1 x "fungi"