How to programmatically guess whether a CSV file is comma or semicolon delimited

前端 未结 5 1776
滥情空心
滥情空心 2021-01-04 08:29

In most cases, CSV files are text files with records delimited by commas. However, sometimes these files are semicolon-delimited instead. (Excel will use a semicolon delimiter …)

5条回答
  •  别那么骄傲
    2021-01-04 09:27

    This is my code (it does no validation of the text). Perhaps it can help, or serve as a starting point. :-)

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text.RegularExpressions;
    using MoreLinq; // http://stackoverflow.com/questions/15265588/how-to-find-item-with-max-value-using-linq
    
    namespace HQ.Util.General.CSV
    {
        /// <summary>
        /// Helpers for guessing the field separator of a delimited text file (tab,
        /// semicolon or comma) and splitting its lines into field values.
        /// </summary>
        public class CsvHelper
        {
            // One regex per supported delimiter, built once and reused for every line.
            private static readonly Regex CommaFieldRegex = BuildFieldRegex(",");
            private static readonly Regex TabFieldRegex = BuildFieldRegex(@"\t");
            private static readonly Regex SemicolonFieldRegex = BuildFieldRegex(";");

            /// <summary>
            /// Maps each <see cref="LineSeparator"/> to the function that splits one line
            /// using that separator. Populated by the static constructor.
            /// </summary>
            public static Dictionary<LineSeparator, Func<string, string[]>> DictionaryOfLineSeparatorAndItsFunc = new Dictionary<LineSeparator, Func<string, string[]>>();

            static CsvHelper()
            {
                DictionaryOfLineSeparatorAndItsFunc[LineSeparator.Unknown] = ParseLineNotSeparated;
                DictionaryOfLineSeparatorAndItsFunc[LineSeparator.Tab] = ParseLineTabSeparated;
                DictionaryOfLineSeparatorAndItsFunc[LineSeparator.Semicolon] = ParseLineSemicolonSeparated;
                DictionaryOfLineSeparatorAndItsFunc[LineSeparator.Comma] = ParseLineCommaSeparated;
            }

            // ******************************************************************
            /// <summary>Field separators this helper can detect.</summary>
            public enum LineSeparator
            {
                Unknown = 0,
                Tab,
                Semicolon,
                Comma
            }

            // ******************************************************************
            /// <summary>
            /// Guesses the separator of a line by splitting it with every supported
            /// separator and keeping the one that produces the most values.
            /// </summary>
            /// <param name="oneLine">A representative line, typically the first line of the file.</param>
            /// <returns>
            /// The separator yielding the highest value count, or
            /// <see cref="LineSeparator.Unknown"/> when no separator yields more than one value.
            /// On a tie the earlier candidate wins (Tab, then Semicolon, then Comma).
            /// </returns>
            public static LineSeparator GuessCsvSeparator(string oneLine)
            {
                LineSeparator[] candidates =
                {
                    LineSeparator.Tab,
                    LineSeparator.Semicolon,
                    LineSeparator.Comma
                };

                int[] valueCounts =
                {
                    ParseLineTabSeparated(oneLine).Length,
                    ParseLineSemicolonSeparated(oneLine).Length,
                    ParseLineCommaSeparated(oneLine).Length
                };

                // A separator only counts as detected when it splits the line into more
                // than one value, hence the starting threshold of 1. The strict '>' keeps
                // the first candidate on ties. A plain loop is used instead of a MaxBy
                // extension so the result does not depend on which library supplies MaxBy.
                LineSeparator bestSeparator = LineSeparator.Unknown;
                int bestCount = 1;

                for (int i = 0; i < candidates.Length; i++)
                {
                    if (valueCounts[i] > bestCount)
                    {
                        bestCount = valueCounts[i];
                        bestSeparator = candidates[i];
                    }
                }

                return bestSeparator;
            }

            // ******************************************************************
            /// <summary>Splits one comma-separated line into its field values.</summary>
            /// <param name="line">The line to split.</param>
            /// <returns>The field values, trimmed and with doubled quotes unescaped.</returns>
            public static string[] ParseLineCommaSeparated(string line)
            {
                // CSV line parsing : From "jgr4" in http://www.kimgentes.com/worshiptech-web-tools-page/2008/10/14/regex-pattern-for-parsing-csv-files-with-embedded-commas-dou.html
                return ParseLineSeparated(line, CommaFieldRegex);
            }

            // ******************************************************************
            /// <summary>Splits one tab-separated line into its field values.</summary>
            /// <param name="line">The line to split.</param>
            /// <returns>The field values, trimmed and with doubled quotes unescaped.</returns>
            public static string[] ParseLineTabSeparated(string line)
            {
                return ParseLineSeparated(line, TabFieldRegex);
            }

            // ******************************************************************
            /// <summary>Splits one semicolon-separated line into its field values.</summary>
            /// <param name="line">The line to split.</param>
            /// <returns>The field values, trimmed and with doubled quotes unescaped.</returns>
            public static string[] ParseLineSemicolonSeparated(string line)
            {
                // CSV line parsing : From "jgr4" in http://www.kimgentes.com/worshiptech-web-tools-page/2008/10/14/regex-pattern-for-parsing-csv-files-with-embedded-commas-dou.html
                return ParseLineSeparated(line, SemicolonFieldRegex);
            }

            // ******************************************************************
            /// <summary>Returns the whole line as a single value (no separator detected).</summary>
            /// <param name="line">The line to wrap.</param>
            /// <returns>A one-element array containing <paramref name="line"/> unchanged.</returns>
            public static string[] ParseLineNotSeparated(string line)
            {
                return new string[] { line };
            }

            // ******************************************************************
            /// <summary>
            /// Splits a text into lines and parses every non-blank line, guessing the
            /// separator from the first non-blank line.
            /// </summary>
            /// <param name="text">The full text. Lines may end with CRLF, LF or CR.</param>
            /// <returns>One array of field values per non-blank line.</returns>
            public static List<string[]> ParseText(string text)
            {
                // "\r\n" is listed first so a Windows line ending is consumed as a single
                // separator; blank lines are skipped by ParseString in any case.
                string[] lines = text.Split(new string[] { "\r\n", "\n", "\r" }, StringSplitOptions.None);
                return ParseString(lines);
            }

            // ******************************************************************
            /// <summary>
            /// Parses the given lines, guessing the separator from the first non-blank
            /// line and skipping lines that are null, empty or whitespace only.
            /// </summary>
            /// <param name="lines">The lines to parse.</param>
            /// <returns>One array of field values per non-blank line.</returns>
            /// <exception cref="ArgumentNullException"><paramref name="lines"/> is null.</exception>
            public static List<string[]> ParseString(string[] lines)
            {
                if (lines == null)
                {
                    throw new ArgumentNullException("lines");
                }

                // Guess from the first line that has content: a leading blank line would
                // otherwise make every separator look absent.
                LineSeparator lineSeparator = LineSeparator.Unknown;
                foreach (string candidate in lines)
                {
                    if (!string.IsNullOrWhiteSpace(candidate))
                    {
                        lineSeparator = GuessCsvSeparator(candidate);
                        break;
                    }
                }

                Func<string, string[]> funcParse = DictionaryOfLineSeparatorAndItsFunc[lineSeparator];

                List<string[]> result = new List<string[]>();
                foreach (string line in lines)
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    result.Add(funcParse(line));
                }

                return result;
            }

            // ******************************************************************
            // Builds the field-matching regex for one delimiter. 'delimiter' is inserted
            // verbatim into the pattern, so it must already be regex-safe (",", ";", @"\t").
            // Each match is one field; its text is captured in the named group "x".
            // NOTE(review): the leading \s? can consume a tab, so an empty field between
            // two consecutive tabs appears to be dropped by the tab pattern; a trailing
            // empty field is likewise not reported. Kept as in the original pattern.
            private static Regex BuildFieldRegex(string delimiter)
            {
                string pattern =
                    @"\s?("
                    + @"(?<x>(?=[" + delimiter + @"]+))"   // empty field: the next character is a delimiter
                    + @"|""(?<x>([^""]|"""")+)"""          // quoted field; "" inside is an escaped quote
                    + @"|""(?<x>)"""                       // empty quoted field
                    + @"|(?<x>[^" + delimiter + @"]+)"     // unquoted field
                    + @")" + delimiter + @"?";

                return new Regex(pattern, RegexOptions.ExplicitCapture);
            }

            // Runs the field regex over the line and returns the captured values,
            // trimmed and with doubled quotes ("") collapsed to a single quote.
            private static string[] ParseLineSeparated(string line, Regex fieldRegex)
            {
                MatchCollection matches = fieldRegex.Matches(line);
                string[] values = new string[matches.Count];

                for (int i = 0; i < values.Length; i++)
                {
                    values[i] = matches[i].Groups["x"].Value.Trim().Replace("\"\"", "\"");
                }

                return values;
            }

            // ******************************************************************
        }
    }
    

提交回复
热议问题