How can I parse a CSV string with JavaScript, which contains comma in data?

前端 未结 17 913
不知归路
不知归路 2020-11-22 01:52

I have the following type of string

// Sample input: a single-quoted first value that itself contains commas,
// followed by two unquoted values.
var string = "\'string, duppi, du\', 23, lala"

I want to split the string into an array on each comma, but only on the commas that are outside the single-quoted value.

相关标签:
17条回答
  • 2020-11-22 02:06

    To complement this answer

    If you need to parse quotes escaped with another quote, example:

    "some ""value"" that is on xlsx file",123
    

    You can use

    /**
     * Splits one line of CSV text into an array of string values.
     *
     * Four value forms are recognised by the pattern below:
     *   1. single-quoted, with backslash escapes  ('it\'s')
     *   2. double-quoted, with backslash escapes  ("say \"hi\"")
     *   3. double-quoted, with doubled quotes     ("say ""hi""")
     *   4. unquoted (surrounding whitespace is not captured)
     *
     * @param {string} text - The CSV line to split.
     * @returns {string[]} The values in order, with the escape sequences of
     *     the quoted forms resolved.
     */
    function parse(text) {
      const csvExp = /(?!\s*$)\s*(?:'([^'\\]*(?:\\[\S\s][^'\\]*)*)'|"([^"\\]*(?:\\[\S\s][^"\\]*)*)"|"([^""]*(?:"[\S\s][^""]*)*)"|([^,'"\s\\]*(?:\s+[^,'"\s\\]+)*))\s*(?:,|$)/g;

      const values = [];

      // Exactly one of the four capture groups participates in each match.
      for (const [, singleQuoted, backslashEscaped, quoteDoubled, bare] of text.matchAll(csvExp)) {
        if (singleQuoted !== undefined) {
          values.push(singleQuoted.replace(/\\'/g, "'"));
          continue;
        }
        if (backslashEscaped !== undefined) {
          values.push(backslashEscaped.replace(/\\"/g, '"'));
          continue;
        }
        if (quoteDoubled !== undefined) {
          values.push(quoteDoubled.replace(/""/g, '"'));
          continue;
        }
        if (bare !== undefined) {
          values.push(bare);
        }
      }

      // The pattern refuses to match an empty final value, so a trailing
      // comma is detected separately and contributes one empty string.
      const hasTrailingComma = /,\s*$/.test(text);
      if (hasTrailingComma) {
        values.push('');
      }

      return values;
    }
    
    0 讨论(0)
  • 2020-11-22 02:06

    According to this blog post, this function should do it:

    /**
     * Splits a string of separator-delimited values in which a value may be
     * wrapped in single quotes, so that it can contain the separator, and may
     * write a literal single quote as two single quotes ('').
     *
     * The string is first split naively on the separator and the pieces are
     * then walked from last to first:
     *   - a piece that is a complete quoted value has its quotes (and any
     *     whitespace outside them) removed and '' collapsed to ';
     *   - a piece that only closes a quoted value is glued back onto the
     *     piece before it, and the merged piece is examined on the next pass;
     *   - any other piece just has '' collapsed to '.
     *
     * Leading/trailing whitespace of unquoted values is preserved.
     *
     * NOTE(review): this extends a native prototype, which is generally
     * discouraged; it is kept because callers invoke it as a string method.
     *
     * @param {string} [sep=","] - Separator between values.
     * @returns {string[]} The parsed values.
     */
    String.prototype.splitCSV = function(sep) {
      var delimiter = sep || ",";
      var fields = this.split(delimiter);

      for (var i = fields.length - 1; i >= 0; i--) {
        var piece = fields[i];

        // Closing quote, optionally followed by whitespace. The original
        // check indexed the trimmed copy with the untrimmed length, so it
        // missed a closing quote whenever whitespace followed it.
        var closesQuote = /'\s*$/.test(piece);

        if (!closesQuote) {
          // The original discarded the result of this replace, so doubled
          // quotes in unquoted values were never collapsed.
          fields[i] = piece.replace(/''/g, "'");
          continue;
        }

        // Needs two distinct quote characters: one opening, one closing.
        var isCompleteQuotedValue = /^\s*'[\s\S]*'\s*$/.test(piece);

        if (isCompleteQuotedValue) {
          fields[i] = piece.replace(/^\s*'|'\s*$/g, '').replace(/''/g, "'");
        } else if (i > 0) {
          // The separator was inside a quoted value: undo the naive split.
          fields.splice(i - 1, 2, fields[i - 1] + delimiter + fields[i]);
        } else {
          // Unbalanced closing quote with nothing left to merge into:
          // fall back to the plain pieces of the first element.
          fields = fields.shift().split(delimiter).concat(fields);
        }
      }

      return fields;
    };
    

    You would call it like so:

    // Example: split the sample line using the default separator, then show
    // the resulting values joined with "|" so the field boundaries are visible.
    var string = "'string, duppi, du', 23, lala";
    var parsed = string.splitCSV();
    alert(parsed.join("|"));
    

    This jsfiddle kind of works, but it looks like some of the elements have spaces before them.

    0 讨论(0)
  • 2020-11-22 02:08

    When a CSV file is read into a string, the string may contain null characters (\0) between the values, so strip the \0 characters from each line before parsing it. This works for me.

    // Remove every NUL character (\0) from the line.
    stringLine = stringLine.replace(/\0/g, "" );
    
    0 讨论(0)
  • 2020-11-22 02:10

    RFC 4180 solution

    This does not solve the string in the question, since its format does not conform to RFC 4180; the accepted encoding escapes a double quote with another double quote. The solution below works correctly with CSV files downloaded from Google Spreadsheets.

    UPDATE (3/2017)

    Parsing a single line at a time would be wrong. According to RFC 4180, fields may contain CRLF, which will cause any line reader to break the CSV file. Here is an updated version that parses a whole CSV string:

    'use strict';
    
    /**
     * Parses CSV text (comma-separated, double-quote-quoted, with a doubled
     * double quote standing for one literal quote) into rows of fields.
     *
     * Commas and line breaks inside a quoted field are kept as field content.
     * Records are separated by "\n"; a "\r" immediately before that "\n" is
     * dropped. A line break that terminates the last record does not start
     * an additional empty record.
     *
     * @param {string} text - The CSV text.
     * @returns {string[][]} One array of field strings per record. Empty
     *     input yields a single record holding one empty field.
     */
    function csvToArray(text) {
        let previous = '';           // previous character, or '' right after a separator
        let row = [''];
        const rows = [row];
        let fieldIndex = 0;
        let outsideQuotes = true;
        let endedWithLineBreak = false;

        for (const ch of text) {
            let current = ch;
            endedWithLineBreak = false;

            if (ch === '"') {
                // A quote directly after a closing quote is an escaped quote.
                if (outsideQuotes && previous === '"') {
                    row[fieldIndex] += '"';
                }
                outsideQuotes = !outsideQuotes;
            } else if (ch === ',' && outsideQuotes) {
                fieldIndex += 1;
                row[fieldIndex] = '';
                current = '';
            } else if (ch === '\n' && outsideQuotes) {
                // The "\r" of a CRLF was already appended to the field; drop it.
                if (previous === '\r') {
                    row[fieldIndex] = row[fieldIndex].slice(0, -1);
                }
                row = [''];
                rows.push(row);
                fieldIndex = 0;
                current = '';
                endedWithLineBreak = true;
            } else {
                row[fieldIndex] += ch;
            }

            previous = current;
        }

        // RFC 4180 lets the last record end with a line break; the fresh row
        // opened for it is not a real record.
        if (endedWithLineBreak) {
            rows.pop();
        }

        return rows;
    };
    
    // Demo input: two records whose fields cover quoted values, escaped
    // (doubled) double quotes, embedded commas, an unquoted value, and a
    // quoted value that ends with a CRLF.
    let test = '"one","two with escaped """" double quotes""","three, with, commas",four with no quotes,"five with CRLF\r\n"\r\n"2nd line one","two with escaped """" double quotes""","three, with, commas",four with no quotes,"five with CRLF\r\n"';
    console.log(csvToArray(test));

    OLD ANSWER

    (Single line solution)

    /**
     * Parses a single CSV line (comma-separated, double-quote-quoted, with a
     * doubled double quote standing for one literal quote) into its fields.
     *
     * @param {string} text - One CSV line.
     * @returns {string[]} The field values; empty input yields [''].
     */
    function CSVtoArray(text) {
        const fields = [''];
        let fieldIndex = 0;
        let previous = '';        // '' means "at the very start of a field"
        let outsideQuotes = true;

        // for...of visits only the characters. The former for...in also
        // visited enumerable properties inherited from String.prototype and
        // appended their values to the last field.
        for (const ch of text) {
            let current = ch;

            if (ch === '"') {
                outsideQuotes = !outsideQuotes;
                if (previous === '"') {
                    // Second quote of a doubled pair: emit one literal quote.
                    // Recording '-' stops a third quote from pairing with it.
                    fields[fieldIndex] += '"';
                    current = '-';
                } else if (previous === '') {
                    // Opening quote of a field: record '-' so that an
                    // immediately following closing quote is not read as a pair.
                    current = '-';
                }
            } else if (outsideQuotes && ch === ',') {
                fieldIndex += 1;
                fields[fieldIndex] = '';
                current = '';
            } else {
                fields[fieldIndex] += ch;
            }

            previous = current;
        }

        return fields;
    }
    // Demo input: one line mixing quoted values, escaped (doubled) double
    // quotes, embedded commas and unquoted values.
    let test = '"one","two with escaped """" double quotes""","three, with, commas",four with no quotes,five for fun';
    console.log(CSVtoArray(test));

    And just for fun, here is how you create a CSV line from an array:

    /**
     * Serialises one row of values as a CSV line in which every field is
     * wrapped in double quotes and embedded double quotes are doubled.
     *
     * The input array is left untouched. Non-string values are converted with
     * String(); null and undefined become empty fields.
     *
     * @param {Array} row - The field values of one record.
     * @returns {string} The CSV line (without a line terminator). An empty
     *     array yields '""'.
     */
    function arrayToCSV(row) {
        const escaped = row.map((value) => {
            const fieldText = value == null ? '' : String(value);
            return fieldText.replace(/"/g, '""');
        });
        return '"' + escaped.join('","') + '"';
    }
    
    // Demo: serialise a row whose values include a double quote and commas,
    // then print the resulting CSV line.
    let row = [
      "one",
      "two with escaped \" double quote",
      "three, with, commas",
      "four with no quotes (now has)",
      "five for fun"
    ];
    let text = arrayToCSV(row);
    console.log(text);

    0 讨论(0)
  • 2020-11-22 02:15

    Disclaimer

    2014-12-01 Update: The answer below works only for one very specific format of CSV. As correctly pointed out by DG in the comments, this solution does not fit the RFC 4180 definition of CSV and it also does not fit Microsoft Excel format. This solution simply demonstrates how one can parse one (non-standard) CSV line of input which contains a mix of string types, where the strings may contain escaped quotes and commas.

    A non-standard CSV solution

    As austincheney correctly points out, you really need to parse the string from start to finish if you wish to properly handle quoted strings that may contain escaped characters. Also, the OP does not clearly define what a "CSV string" really is. First we must define what constitutes a valid CSV string and its individual values.

    Given: "CSV String" Definition

    For the purpose of this discussion, a "CSV string" consists of zero or more values, where multiple values are separated by a comma. Each value may consist of:

    1. A double quoted string (may contain unescaped single quotes).
    2. A single quoted string (may contain unescaped double quotes).
    3. A non-quoted string (may not contain quotes, commas or backslashes).
    4. An empty value. (An all whitespace value is considered empty.)

    Rules/Notes:

    • Quoted values may contain commas.
    • Quoted values may contain escaped-anything, e.g. 'that\'s cool'.
    • Values containing quotes, commas, or backslashes must be quoted.
    • Values containing leading or trailing whitespace must be quoted.
    • The backslash is removed from all: \' in single quoted values.
    • The backslash is removed from all: \" in double quoted values.
    • Non-quoted strings are trimmed of any leading and trailing spaces.
    • The comma separator may have adjacent whitespace (which is ignored).

    Find:

    A JavaScript function which converts a valid CSV string (as defined above) into an array of string values.

    Solution:

    The regular expressions used by this solution are complex. And (IMHO) all non-trivial regular expressions should be presented in free-spacing mode with lots of comments and indentation. Unfortunately, JavaScript does not allow free-spacing mode. Thus, the regular expressions implemented by this solution are first presented in native regular expressions syntax (expressed using Python's handy r'''...''' raw-multi-line-string syntax).

    First, here is a regular expression which validates that a CSV string meets the above requirements:

    Regular expression to validate a "CSV string":

    re_valid = r"""
    # Validate a CSV string having single, double or un-quoted values.
    ^                                   # Anchor to start of string.
    \s*                                 # Allow whitespace before value.
    (?:                                 # Group for value alternatives.
      '[^'\\]*(?:\\[\S\s][^'\\]*)*'     # Either Single quoted string,
    | "[^"\\]*(?:\\[\S\s][^"\\]*)*"     # or Double quoted string,
    | [^,'"\s\\]*(?:\s+[^,'"\s\\]+)*    # or Non-comma, non-quote stuff.
    )                                   # End group of value alternatives.
    \s*                                 # Allow whitespace after value.
    (?:                                 # Zero or more additional values
      ,                                 # Values separated by a comma.
      \s*                               # Allow whitespace before value.
      (?:                               # Group for value alternatives.
        '[^'\\]*(?:\\[\S\s][^'\\]*)*'   # Either Single quoted string,
      | "[^"\\]*(?:\\[\S\s][^"\\]*)*"   # or Double quoted string,
      | [^,'"\s\\]*(?:\s+[^,'"\s\\]+)*  # or Non-comma, non-quote stuff.
      )                                 # End group of value alternatives.
      \s*                               # Allow whitespace after value.
    )*                                  # Zero or more additional values
    $                                   # Anchor to end of string.
    """
    

    If a string matches the above regular expression, then that string is a valid CSV string (according to the rules previously stated) and may be parsed using the following regular expression. The following regular expression is then used to match one value from the CSV string. It is applied repeatedly until no more matches are found (and all values have been parsed).

    Regular expression to parse one value from a valid CSV string:

    re_value = r"""
    # Match one value in valid CSV string.
    (?!\s*$)                            # Don't match empty last value.
    \s*                                 # Strip whitespace before value.
    (?:                                 # Group for value alternatives.
      '([^'\\]*(?:\\[\S\s][^'\\]*)*)'   # Either $1: Single quoted string,
    | "([^"\\]*(?:\\[\S\s][^"\\]*)*)"   # or $2: Double quoted string,
    | ([^,'"\s\\]*(?:\s+[^,'"\s\\]+)*)  # or $3: Non-comma, non-quote stuff.
    )                                   # End group of value alternatives.
    \s*                                 # Strip whitespace after value.
    (?:,|$)                             # Field ends on comma or EOS.
    """
    

    Note that there is one special case value that this regular expression does not match - the very last value when that value is empty. This special "empty last value" case is tested for and handled by the JavaScript function which follows.

    JavaScript function to parse CSV string:

    /**
     * Converts a CSV string into an array of string values.
     *
     * A value may be single-quoted, double-quoted or unquoted. Inside a
     * single-quoted value \' is turned into ', and inside a double-quoted
     * value \" is turned into ". Whitespace around values is not captured.
     *
     * @param {string} text - The CSV string.
     * @returns {?string[]} The values, or null when the string is not a
     *     well-formed CSV string according to the validation pattern.
     */
    function CSVtoArray(text) {
        const re_valid = /^\s*(?:'[^'\\]*(?:\\[\S\s][^'\\]*)*'|"[^"\\]*(?:\\[\S\s][^"\\]*)*"|[^,'"\s\\]*(?:\s+[^,'"\s\\]+)*)\s*(?:,\s*(?:'[^'\\]*(?:\\[\S\s][^'\\]*)*'|"[^"\\]*(?:\\[\S\s][^"\\]*)*"|[^,'"\s\\]*(?:\s+[^,'"\s\\]+)*)\s*)*$/;
        const re_value = /(?!\s*$)\s*(?:'([^'\\]*(?:\\[\S\s][^'\\]*)*)'|"([^"\\]*(?:\\[\S\s][^"\\]*)*)"|([^,'"\s\\]*(?:\s+[^,'"\s\\]+)*))\s*(?:,|$)/g;

        // Reject anything that is not a well-formed CSV string.
        const isWellFormed = re_valid.test(text);
        if (!isWellFormed) {
            return null;
        }

        const values = [];

        // Visit the values one match at a time; exactly one capture group
        // participates in each match.
        for (const match of text.matchAll(re_value)) {
            const singleQuoted = match[1];
            const doubleQuoted = match[2];
            const unquoted = match[3];

            if (singleQuoted !== undefined) {
                // Drop the backslash from \' sequences.
                values.push(singleQuoted.replace(/\\'/g, "'"));
            } else if (doubleQuoted !== undefined) {
                // Drop the backslash from \" sequences.
                values.push(doubleQuoted.replace(/\\"/g, '"'));
            } else if (unquoted !== undefined) {
                values.push(unquoted);
            }
        }

        // The value pattern never matches an empty final value, so a trailing
        // comma contributes one empty string here.
        const endsWithEmptyValue = /,\s*$/.test(text);
        if (endsWithEmptyValue) {
            values.push('');
        }

        return values;
    };
    

    Example input and output:

    In the following examples, curly braces are used to delimit the {result strings}. (This is to help visualize leading/trailing spaces and zero-length strings.)

    // Worked examples for CSVtoArray. Each expected result is shown in the
    // comment that follows the call; curly braces delimit the result strings.
    // `var` is used deliberately so that `test` and `a` can be redeclared.

    // Test 1: Test string from original question.
    var test = "'string, duppi, du', 23, lala";
    var a = CSVtoArray(test);
    /* Array has three elements:
        a[0] = {string, duppi, du}
        a[1] = {23}
        a[2] = {lala} */
    
    // Test 2: Empty CSV string.
    var test = "";
    var a = CSVtoArray(test);
    /* Array has zero elements: */
    
    // Test 3: CSV string with two empty values.
    var test = ",";
    var a = CSVtoArray(test);
    /* Array has two elements:
        a[0] = {}
        a[1] = {} */
    
    // Test 4: Double quoted CSV string having single quoted values.
    // The backslash is doubled so that it survives JavaScript string-literal
    // parsing and actually reaches the CSV text as \' .
    var test = "'one','two with escaped \\' single quote', 'three, with, commas'";
    var a = CSVtoArray(test);
    /* Array has three elements:
        a[0] = {one}
        a[1] = {two with escaped ' single quote}
        a[2] = {three, with, commas} */
    
    // Test 5: Single quoted CSV string having double quoted values.
    // The backslash is doubled for the same reason as in Test 4.
    var test = '"one","two with escaped \\" double quote", "three, with, commas"';
    var a = CSVtoArray(test);
    /* Array has three elements:
        a[0] = {one}
        a[1] = {two with escaped " double quote}
        a[2] = {three, with, commas} */
    
    // Test 6: CSV string with whitespace in and around empty and non-empty values.
    var test = "   one  ,  'two'  ,  , ' four' ,, 'six ', ' seven ' ,  ";
    var a = CSVtoArray(test);
    /* Array has eight elements:
        a[0] = {one}
        a[1] = {two}
        a[2] = {}
        a[3] = { four}
        a[4] = {}
        a[5] = {six }
        a[6] = { seven }
        a[7] = {} */
    

    Additional notes:

    This solution requires that the CSV string be "valid". For example, unquoted values may not contain backslashes or quotes, e.g. the following CSV string is not valid:

    // Invalid: an unquoted value contains a quote, and another contains a
    // backslash. The backslash is doubled so that it survives JavaScript
    // string-literal parsing and is really present in the CSV text.
    var invalid1 = "one, that's me!, escaped \\, comma";
    

    This is not really a limitation because any sub-string may be represented as either a single or double quoted value. Note also that this solution represents only one possible definition for "comma-separated values".

    Edit history

    • 2014-05-19: Added disclaimer.
    • 2014-12-01: Moved disclaimer to top.
    0 讨论(0)
  • 2020-11-22 02:16

    I had a very specific use case where I wanted to copy cells from Google Sheets into my web app. Cells could include double quotes and newline characters. When copying and pasting, the cells are delimited by tab characters, and cells with odd data are double-quoted. I tried this main solution, the linked article using regexp, jQuery-CSV, and CSVToArray. Papa Parse (http://papaparse.com/) is the only one that worked out of the box. Copy and paste is seamless with Google Sheets using the default auto-detect options.

    0 讨论(0)
提交回复
热议问题