Best way to parse string of email addresses

后端未结

关注

 13  2621

So i am working with some email header data, and for the to:, from:, cc:, and bcc: fields the email address(es) can be expressed in a number of different ways:


                      
              相关标签:


      
      
        
          13条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  死守一世寂寞        
                
              
                            
                2021-02-14 05:01
              
            
            
                                                                       
Your 2nd email example is not a valid address as it contains a comma which is not within a quoted string. To be valid it should be like: "Last, First"<name@domain.com>.

As for parsing, if you want something that is quite strict, you could use System.Net.Mail.MailAddressCollection.

If you just want to your input split into separate email strings, then the following code should work. It is not very strict but will handle commas within quoted strings and throw an exception if the input contains an unclosed quote.

public List<string> SplitAddresses(string addresses)
{
    var result = new List<string>();

    var startIndex = 0;
    var currentIndex = 0;
    var inQuotedString = false;

    while (currentIndex < addresses.Length)
    {
        if (addresses[currentIndex] == QUOTE)
        {
            inQuotedString = !inQuotedString;
        }
        // Split if a comma is found, unless inside a quoted string
        else if (addresses[currentIndex] == COMMA && !inQuotedString)
        {
            var address = GetAndCleanSubstring(addresses, startIndex, currentIndex);
            if (address.Length > 0)
            {
                result.Add(address);
            }
            startIndex = currentIndex + 1;
        }
        currentIndex++;
    }

    if (currentIndex > startIndex)
    {
        var address = GetAndCleanSubstring(addresses, startIndex, currentIndex);
        if (address.Length > 0)
        {
            result.Add(address);
        }
    }

    if (inQuotedString)
        throw new FormatException("Unclosed quote in email addresses");

    return result;
}

private string GetAndCleanSubstring(string addresses, int startIndex, int currentIndex)
{
    var address = addresses.Substring(startIndex, currentIndex - startIndex);
    address = address.Trim();
    return address;
}

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  一整个雨季        
                
              
                            
                2021-02-14 05:02
              
            
            
                                                                       
At the risk of creating two problems, you could create a regular expression that matches any of your email formats. Use "|" to separate the formats within this one regex. Then you can run it over your input string and pull out all of the matches.

public class Address
{
    private string _first;
    private string _last;
    private string _name;
    private string _domain;

    public Address(string first, string last, string name, string domain)
    {
        _first = first;
        _last = last;
        _name = name;
        _domain = domain;
    }

    public string First
    {
        get { return _first; }
    }

    public string Last
    {
        get { return _last; }
    }

    public string Name
    {
        get { return _name; }
    }

    public string Domain
    {
        get { return _domain; }
    }
}

[TestFixture]
public class RegexEmailTest
{
    [Test]
    public void TestThreeEmailAddresses()
    {
        Regex emailAddress = new Regex(
            @"((?<last>\w*), (?<first>\w*) <(?<name>\w*)@(?<domain>\w*\.\w*)>)|" +
            @"((?<first>\w*) (?<last>\w*) <(?<name>\w*)@(?<domain>\w*\.\w*)>)|" +
            @"((?<name>\w*)@(?<domain>\w*\.\w*))");
        string input = "First, Last <name@domain.com>, name@domain.com, First Last <name@domain.com>";

        MatchCollection matches = emailAddress.Matches(input);
        List<Address> addresses =
            (from Match match in matches
             select new Address(
                 match.Groups["first"].Value,
                 match.Groups["last"].Value,
                 match.Groups["name"].Value,
                 match.Groups["domain"].Value)).ToList();
        Assert.AreEqual(3, addresses.Count);

        Assert.AreEqual("Last", addresses[0].First);
        Assert.AreEqual("First", addresses[0].Last);
        Assert.AreEqual("name", addresses[0].Name);
        Assert.AreEqual("domain.com", addresses[0].Domain);

        Assert.AreEqual("", addresses[1].First);
        Assert.AreEqual("", addresses[1].Last);
        Assert.AreEqual("name", addresses[1].Name);
        Assert.AreEqual("domain.com", addresses[1].Domain);

        Assert.AreEqual("First", addresses[2].First);
        Assert.AreEqual("Last", addresses[2].Last);
        Assert.AreEqual("name", addresses[2].Name);
        Assert.AreEqual("domain.com", addresses[2].Domain);
    }
}


There are several down sides to this approach. One is that it doesn't validate the string. If you have any characters in the string that don't fit one of your chosen formats, then those characters are just ignored. Another is that the accepted formats are all expressed in one place. You cannot add new formats without changing the monolithic regex.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  说谎        
                
              
                            
                2021-02-14 05:03
              
            
            
                                                                       
//    Based on Michael Perry's answer * 
// needs to handle first.last@domain.com, first_last@domain.com and related syntaxes
// also looks for first and last name within those email syntaxes

public class ParsedEmail
{
    private string _first;
    private string _last;
    private string _name;
    private string _domain;

    public ParsedEmail(string first, string last, string name, string domain)
    {
        _name = name;
        _domain = domain;

        // first.last@domain.com, first_last@domain.com etc. syntax
        char[] chars = { '.', '_', '+', '-' };
        var pos = _name.IndexOfAny(chars);

        if (string.IsNullOrWhiteSpace(_first) && string.IsNullOrWhiteSpace(_last) && pos > -1)
        {
            _first = _name.Substring(0, pos);
            _last = _name.Substring(pos+1);
        }
    }

    public string First
    {
        get { return _first; }
    }

    public string Last
    {
        get { return _last; }
    }

    public string Name
    {
        get { return _name; }
    }

    public string Domain
    {
        get { return _domain; }
    }

    public string Email
    {
        get
        {
            return Name + "@" + Domain;
        }
    }

    public override string ToString()
    {
        return Email;
    }

    public static IEnumerable<ParsedEmail> SplitEmailList(string delimList)
    {
        delimList = delimList.Replace("\"", string.Empty);

        Regex re = new Regex(
                    @"((?<last>\w*), (?<first>\w*) <(?<name>[a-zA-Z_0-9\.\+\-]+)@(?<domain>\w*\.\w*)>)|" +
                    @"((?<first>\w*) (?<last>\w*) <(?<name>[a-zA-Z_0-9\.\+\-]+)@(?<domain>\w*\.\w*)>)|" +
                    @"((?<name>[a-zA-Z_0-9\.\+\-]+)@(?<domain>\w*\.\w*))");


        MatchCollection matches = re.Matches(delimList);

        var parsedEmails =
                   (from Match match in matches
                    select new ParsedEmail(
                            match.Groups["first"].Value,
                            match.Groups["last"].Value,
                            match.Groups["name"].Value,
                            match.Groups["domain"].Value)).ToList();

        return parsedEmails;

    }


}

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  一向        
                
              
                            
                2021-02-14 05:06
              
            
            
                                                                       
Here's what I came up with. It assumes that a valid email address must have one and only one '@' sign in it:

    public List<MailAddress> ParseAddresses(string field)
    {
        var tokens = field.Split(',');
        var addresses = new List<string>();

        var tokenBuffer = new List<string>();

        foreach (var token in tokens)
        {
            tokenBuffer.Add(token);

            if (token.IndexOf("@", StringComparison.Ordinal) > -1)
            {
                addresses.Add( string.Join( ",", tokenBuffer));
                tokenBuffer.Clear();
            }
        }

        return addresses.Select(t => new MailAddress(t)).ToList();
    }

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  情深已故        
                
              
                            
                2021-02-14 05:11
              
            
            
                                                                       
There is internal System.Net.Mail.MailAddressParser class which has method ParseMultipleAddresses which does exactly what you want. You can access it directly through reflection or by calling MailMessage.To.Add method, which accepts email list string.

private static IEnumerable<MailAddress> ParseAddress(string addresses)
{
    var mailAddressParserClass = Type.GetType("System.Net.Mail.MailAddressParser");
    var parseMultipleAddressesMethod = mailAddressParserClass.GetMethod("ParseMultipleAddresses", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static);
    return (IList<MailAddress>)parseMultipleAddressesMethod.Invoke(null, new object[0]);
}


    private static IEnumerable<MailAddress> ParseAddress(string addresses)
    {
        MailMessage message = new MailMessage();
        message.To.Add(addresses);
        return new List<MailAddress>(message.To); //new List, because we don't want to hold reference on Disposable object
    }

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  难免孤独        
                
              
                            
                2021-02-14 05:11
              
            
            
                                                                       
There isn't really an easy solution to this. I would recommend making a little state machine that reads char-by-char and do the work that way. Like you said, splitting by comma won't always work.

A state machine will allow you to cover all possibilities. I'm sure there are many others you haven't seen yet. For example: "First Last" 

Look for the RFC about this to discover what all the possibilities are. Sorry, I don't know the number. There are probably multiple as this is the kind of things that evolves.
                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
   
          
     上一页
1
2
3
下一页
           
           
        
                                  
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复