So I am working with some email header data, and for the to:, from:, cc:, and bcc: fields the email address(es) can be expressed in a number of different ways:
F
Your 2nd email example is not a valid address as it contains a comma which is not within a quoted string. To be valid it should be like: "Last, First"<name@domain.com>
.
As for parsing, if you want something that is quite strict, you could use System.Net.Mail.MailAddressCollection
.
If you just want your input split into separate email strings, then the following code should work. It is not very strict, but it will handle commas within quoted strings and throw an exception if the input contains an unclosed quote.
/// <summary>
/// Splits a delimited list of addresses into individual trimmed strings.
/// A character equal to <c>QUOTE</c> toggles a "quoted" state; a character equal to
/// <c>COMMA</c> ends the current fragment only while that state is off.
/// (<c>QUOTE</c> and <c>COMMA</c> are constants defined outside this method.)
/// </summary>
/// <param name="addresses">The raw address list to split.</param>
/// <returns>The non-empty, trimmed fragments in the order they appear.</returns>
/// <exception cref="FormatException">
/// Thrown when the end of the input is reached while the quoted state is still on.
/// </exception>
public List<string> SplitAddresses(string addresses)
{
    var fragments = new List<string>();
    var insideQuotes = false;
    var segmentStart = 0;

    for (var position = 0; position < addresses.Length; position++)
    {
        var ch = addresses[position];

        if (ch == QUOTE)
        {
            insideQuotes = !insideQuotes;
            continue;
        }

        // Only an unquoted separator closes the current fragment.
        if (ch != COMMA || insideQuotes)
        {
            continue;
        }

        var piece = GetAndCleanSubstring(addresses, segmentStart, position);
        if (piece.Length != 0)
        {
            fragments.Add(piece);
        }

        segmentStart = position + 1;
    }

    // Whatever follows the last separator is the final fragment.
    if (segmentStart < addresses.Length)
    {
        var tail = GetAndCleanSubstring(addresses, segmentStart, addresses.Length);
        if (tail.Length != 0)
        {
            fragments.Add(tail);
        }
    }

    if (insideQuotes)
    {
        throw new FormatException("Unclosed quote in email addresses");
    }

    return fragments;
}
/// <summary>
/// Returns the slice of <paramref name="addresses"/> from <paramref name="startIndex"/>
/// (inclusive) to <paramref name="currentIndex"/> (exclusive), with leading and trailing
/// whitespace removed.
/// </summary>
/// <param name="addresses">The string to slice.</param>
/// <param name="startIndex">Index of the first character of the slice.</param>
/// <param name="currentIndex">Index one past the last character of the slice.</param>
/// <returns>The trimmed slice.</returns>
private string GetAndCleanSubstring(string addresses, int startIndex, int currentIndex)
{
    var length = currentIndex - startIndex;
    return addresses.Substring(startIndex, length).Trim();
}
At the risk of creating two problems, you could create a regular expression that matches any of your email formats. Use "|" to separate the formats within this one regex. Then you can run it over your input string and pull out all of the matches.
/// <summary>
/// Immutable holder for the four pieces of a parsed address: the person's first and
/// last name, the local part of the email address and its domain.
/// </summary>
public class Address
{
    /// <summary>Stores the supplied values unchanged.</summary>
    /// <param name="first">Value exposed through <see cref="First"/>.</param>
    /// <param name="last">Value exposed through <see cref="Last"/>.</param>
    /// <param name="name">Value exposed through <see cref="Name"/>.</param>
    /// <param name="domain">Value exposed through <see cref="Domain"/>.</param>
    public Address(string first, string last, string name, string domain)
    {
        First = first;
        Last = last;
        Name = name;
        Domain = domain;
    }

    /// <summary>The first name passed to the constructor.</summary>
    public string First { get; private set; }

    /// <summary>The last name passed to the constructor.</summary>
    public string Last { get; private set; }

    /// <summary>The name (part before the '@') passed to the constructor.</summary>
    public string Name { get; private set; }

    /// <summary>The domain (part after the '@') passed to the constructor.</summary>
    public string Domain { get; private set; }
}
/// <summary>
/// Demonstrates pulling several differently formatted addresses out of one string
/// with a single regular expression made of three alternatives.
/// </summary>
[TestFixture]
public class RegexEmailTest
{
    /// <summary>
    /// Runs the combined pattern over a three-entry list and checks the captured
    /// groups of every match.
    /// </summary>
    [Test]
    public void TestThreeEmailAddresses()
    {
        // One alternative per accepted layout; the first two end with '|' so the
        // concatenation forms a single alternation.
        const string lastCommaFirst = @"((?<last>\w*), (?<first>\w*) <(?<name>\w*)@(?<domain>\w*\.\w*)>)|";
        const string firstSpaceLast = @"((?<first>\w*) (?<last>\w*) <(?<name>\w*)@(?<domain>\w*\.\w*)>)|";
        const string bareAddress = @"((?<name>\w*)@(?<domain>\w*\.\w*))";
        var pattern = new Regex(lastCommaFirst + firstSpaceLast + bareAddress);

        var header = "First, Last <name@domain.com>, name@domain.com, First Last <name@domain.com>";

        var parsed = pattern.Matches(header)
            .Cast<Match>()
            .Select(m => new Address(
                m.Groups["first"].Value,
                m.Groups["last"].Value,
                m.Groups["name"].Value,
                m.Groups["domain"].Value))
            .ToList();

        Assert.AreEqual(3, parsed.Count);

        // The first entry is written "First, Last", so the "last, first" alternative
        // captures the word "First" as the last name and "Last" as the first name.
        AssertAddress(parsed[0], "Last", "First", "name", "domain.com");
        // A bare address has no name groups; unmatched groups yield empty strings.
        AssertAddress(parsed[1], "", "", "name", "domain.com");
        AssertAddress(parsed[2], "First", "Last", "name", "domain.com");
    }

    // Checks all four properties of one parsed address, in First/Last/Name/Domain order.
    private static void AssertAddress(Address actual, string first, string last, string name, string domain)
    {
        Assert.AreEqual(first, actual.First);
        Assert.AreEqual(last, actual.Last);
        Assert.AreEqual(name, actual.Name);
        Assert.AreEqual(domain, actual.Domain);
    }
}
There are several downsides to this approach. One is that it doesn't validate the string. If you have any characters in the string that don't fit one of your chosen formats, then those characters are just ignored. Another is that the accepted formats are all expressed in one place. You cannot add new formats without changing the monolithic regex.
// Based on Michael Perry's answer. Needs to handle first.last@domain.com, first_last@domain.com and related syntaxes; also looks for a first and last name within those email syntaxes.
/// <summary>
/// One email address extracted from a delimited list, together with the first and
/// last name when they can be determined either from a display name
/// ("Last, First &lt;addr&gt;" / "First Last &lt;addr&gt;") or from a separator inside the
/// local part (first.last@..., first_last@..., first+last@..., first-last@...).
/// </summary>
public class ParsedEmail
{
    // Characters treated as a first/last separator inside the local part.
    private static readonly char[] NameSeparators = { '.', '_', '+', '-' };

    // Three alternatives: "Last, First <addr>", "First Last <addr>", bare "addr".
    // Built once; Regex instances can be shared for matching.
    private static readonly Regex AddressPattern = new Regex(
        @"((?<last>\w*), (?<first>\w*) <(?<name>[a-zA-Z_0-9\.\+\-]+)@(?<domain>\w*\.\w*)>)|" +
        @"((?<first>\w*) (?<last>\w*) <(?<name>[a-zA-Z_0-9\.\+\-]+)@(?<domain>\w*\.\w*)>)|" +
        @"((?<name>[a-zA-Z_0-9\.\+\-]+)@(?<domain>\w*\.\w*))");

    private readonly string _first;
    private readonly string _last;
    private readonly string _name;
    private readonly string _domain;

    /// <summary>
    /// Creates a parsed address. When <paramref name="first"/> or <paramref name="last"/>
    /// carries a value, both are stored as given. When both are null/blank, the names are
    /// derived by splitting <paramref name="name"/> at its first '.', '_', '+' or '-';
    /// if there is no such separator, <see cref="First"/> and <see cref="Last"/> stay null.
    /// </summary>
    /// <param name="first">First name taken from the display name, or null/blank.</param>
    /// <param name="last">Last name taken from the display name, or null/blank.</param>
    /// <param name="name">Local part of the address (before the '@').</param>
    /// <param name="domain">Domain of the address (after the '@').</param>
    /// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
    public ParsedEmail(string first, string last, string name, string domain)
    {
        if (name == null)
        {
            throw new ArgumentNullException("name");
        }

        _name = name;
        _domain = domain;

        if (!string.IsNullOrWhiteSpace(first) || !string.IsNullOrWhiteSpace(last))
        {
            // Previously these arguments were never stored, so names captured from a
            // display name were silently lost.
            _first = first;
            _last = last;
            return;
        }

        // first.last@domain.com, first_last@domain.com etc. syntax
        var pos = _name.IndexOfAny(NameSeparators);
        if (pos > -1)
        {
            _first = _name.Substring(0, pos);
            _last = _name.Substring(pos + 1);
        }
    }

    /// <summary>First name, or null when none could be determined.</summary>
    public string First
    {
        get { return _first; }
    }

    /// <summary>Last name, or null when none could be determined.</summary>
    public string Last
    {
        get { return _last; }
    }

    /// <summary>Local part of the address (before the '@').</summary>
    public string Name
    {
        get { return _name; }
    }

    /// <summary>Domain of the address (after the '@').</summary>
    public string Domain
    {
        get { return _domain; }
    }

    /// <summary>The address in "name@domain" form.</summary>
    public string Email
    {
        get { return Name + "@" + Domain; }
    }

    /// <summary>Returns <see cref="Email"/>.</summary>
    public override string ToString()
    {
        return Email;
    }

    /// <summary>
    /// Extracts every address matching one of the supported layouts from
    /// <paramref name="delimList"/>. Double quotes are removed before matching, and
    /// text that matches none of the layouts is skipped without error.
    /// </summary>
    /// <param name="delimList">The raw list of addresses.</param>
    /// <returns>The parsed addresses in the order they were matched.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="delimList"/> is null.</exception>
    public static IEnumerable<ParsedEmail> SplitEmailList(string delimList)
    {
        if (delimList == null)
        {
            throw new ArgumentNullException("delimList");
        }

        // Quotes around display names are dropped so "Last, First" <addr> fits the first layout.
        delimList = delimList.Replace("\"", string.Empty);

        MatchCollection matches = AddressPattern.Matches(delimList);
        var parsedEmails =
            (from Match match in matches
             select new ParsedEmail(
                 match.Groups["first"].Value,
                 match.Groups["last"].Value,
                 match.Groups["name"].Value,
                 match.Groups["domain"].Value)).ToList();
        return parsedEmails;
    }
}
Here's what I came up with. It assumes that a valid email address must have one and only one '@' sign in it:
/// <summary>
/// Splits <paramref name="field"/> on commas and glues consecutive pieces back together
/// (with the comma restored) until a piece containing '@' is reached; each glued run is
/// then turned into a <see cref="MailAddress"/>. Trailing pieces with no '@' after the
/// last complete run are not returned.
/// </summary>
/// <param name="field">The raw header field value.</param>
/// <returns>One <see cref="MailAddress"/> per reassembled run, in order.</returns>
public List<MailAddress> ParseAddresses(string field)
{
    var pending = new List<string>();
    var reassembled = new List<string>();

    foreach (var fragment in field.Split(','))
    {
        pending.Add(fragment);

        // A fragment without '@' is assumed to belong to the address that follows it.
        if (!fragment.Contains("@"))
        {
            continue;
        }

        reassembled.Add(string.Join(",", pending));
        pending.Clear();
    }

    var parsed = new List<MailAddress>(reassembled.Count);
    foreach (var text in reassembled)
    {
        parsed.Add(new MailAddress(text));
    }

    return parsed;
}
There is an internal System.Net.Mail.MailAddressParser
class which has a ParseMultipleAddresses method
that does exactly what you want. You can access it directly through reflection or by calling the MailMessage.To.Add
method, which accepts an email list string.
/// <summary>
/// Parses a comma-separated address list by invoking the framework's non-public
/// <c>System.Net.Mail.MailAddressParser.ParseMultipleAddresses</c> through reflection.
/// </summary>
/// <param name="addresses">The address list to parse.</param>
/// <returns>The parsed addresses.</returns>
/// <exception cref="InvalidOperationException">
/// The parser type or method cannot be found in the running framework (it is an
/// internal implementation detail and may change between versions).
/// </exception>
/// <remarks>
/// Exceptions thrown by the parser itself surface wrapped in a
/// <see cref="System.Reflection.TargetInvocationException"/>, as with any reflective call.
/// </remarks>
private static IEnumerable<MailAddress> ParseAddress(string addresses)
{
    // Type.GetType(string) with a plain type name does not search the assembly that
    // holds MailAddressParser; look it up in MailAddress's own assembly instead.
    var mailAddressParserClass = typeof(MailAddress).Assembly.GetType("System.Net.Mail.MailAddressParser");
    if (mailAddressParserClass == null)
    {
        throw new InvalidOperationException("System.Net.Mail.MailAddressParser was not found in this framework version.");
    }

    var parseMultipleAddressesMethod = mailAddressParserClass.GetMethod(
        "ParseMultipleAddresses",
        System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static);
    if (parseMultipleAddressesMethod == null)
    {
        throw new InvalidOperationException("MailAddressParser.ParseMultipleAddresses was not found in this framework version.");
    }

    // The input must be passed as the method's single argument; an empty argument
    // array does not match the method's signature.
    return (IList<MailAddress>)parseMultipleAddressesMethod.Invoke(null, new object[] { addresses });
}
/// <summary>
/// Parses a comma-separated address list by letting a temporary
/// <see cref="MailMessage"/> do the work via <c>MailMessage.To.Add(string)</c>.
/// </summary>
/// <param name="addresses">The address list to parse.</param>
/// <returns>A new list holding the parsed addresses.</returns>
private static IEnumerable<MailAddress> ParseAddress(string addresses)
{
    // MailMessage is IDisposable; dispose it once the addresses have been copied out.
    using (var message = new MailMessage())
    {
        message.To.Add(addresses);

        // Copy into a fresh list so the result does not reference the disposed message's collection.
        return new List<MailAddress>(message.To);
    }
}
There isn't really an easy solution to this. I would recommend making a little state machine that reads char-by-char and does the work that way. Like you said, splitting by comma won't always work.
A state machine will allow you to cover all possibilities. I'm sure there are many others you haven't seen yet. For example: "First Last"
Look for the RFC about this to discover what all the possibilities are. Sorry, I don't know the number. There are probably several, as this is the kind of thing that evolves.