Split string with delimiters in C

前端 未结 20 1388
你的背包
你的背包 2020-11-21 11:56

How do I write a function to split and return an array for a string with delimiters in the C programming language?

char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";


        
相关标签:
20条回答
  • 2020-11-21 12:12

    Here is my two cents:

    /*
     * Split txt at every occurrence of delim.
     *
     * txt    - NUL-terminated input; it is only read, never modified.
     * delim  - single separator character.
     * tokens - out: receives a newly allocated array of `count` newly
     *          allocated NUL-terminated strings. The caller frees each
     *          string and then the array itself.
     *
     * Adjacent, leading and trailing delimiters produce empty strings, so the
     * result always holds (number of delimiters + 1) entries.
     *
     * Returns the number of tokens, or -1 with *tokens set to NULL when an
     * allocation fails (nothing is leaked in that case).
     */
    int split (const char *txt, char delim, char ***tokens)
    {
        const char *p;
        char **arr;
        int count = 1, k;

        *tokens = NULL;

        /* one token more than there are delimiters */
        for (p = txt; *p != '\0'; p++)
        {
            if (*p == delim) count += 1;
        }

        /* calloc so that every slot is NULL until it is filled */
        arr = calloc ((size_t) count, sizeof *arr);
        if (arr == NULL) return -1;

        p = txt;
        for (k = 0; k < count; k++)
        {
            const char *end = p;
            size_t len, j;

            /* find the end of the current token */
            while (*end != '\0' && *end != delim) end++;
            len = (size_t) (end - p);

            /* len characters plus the terminator, which calloc already zeroed */
            arr[k] = calloc (len + 1, sizeof (char));
            if (arr[k] == NULL)
            {
                while (k > 0) free (arr[--k]);
                free (arr);
                return -1;
            }
            for (j = 0; j < len; j++) arr[k][j] = p[j];

            /* step over the delimiter; at the terminator there is nothing left */
            if (*end != '\0') p = end + 1;
        }

        *tokens = arr;
        return count;
    }
    

    Usage:

    char **tokens;
    int count, i;
    const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    
    count = split (str, ',', &tokens);
    for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);
    
    /* freeing tokens */
    for (i = 0; i < count; i++) free (tokens[i]);
    free (tokens);
    
    0 讨论(0)
  • 2020-11-21 12:14

    This string tokenizer code should point you in the right direction.

    /* Print each word of a sentence on its own line using strtok(). */
    int main(void) {
      /* strtok() writes into its argument, so the text is held in a writable array */
      char text[] ="Where there is will, there is a way.";
      char *word;

      /* the first call splits on spaces only; later calls also treat ',' as a separator */
      for (word = strtok(text, " "); word != NULL; word = strtok(NULL, " ,")) {
        printf("%s\n", word);
      }

      /* NOTE(review): getch() is not standard C; presumably the conio.h keypress wait - confirm */
      getch();
      return 0;
    }
    
    0 讨论(0)
  • 2020-11-21 12:15

    This is a string splitting function that can handle multi-character delimiters. Note that if the delimiter is longer than the string that is being split, then buffer and stringLengths will be set to (void *) 0, and numStrings will be set to 0.

    This algorithm has been tested, and works. (Disclaimer: It has not been tested for non-ASCII strings, and it assumes that the caller gave valid parameters)

    /*
     * Split `original` at every non-overlapping occurrence of the (possibly
     * multi-character) string `delimiter`, scanning left to right.
     *
     * Outputs (all written on every path):
     *   *buffer        - newly allocated array of *numStrings newly allocated
     *                    NUL-terminated strings
     *   *numStrings    - number of strings produced
     *   *stringLengths - newly allocated array with the length of each string
     * The caller frees each (*buffer)[i], then *buffer and *stringLengths.
     *
     * Special cases:
     *   - delimiter longer than original, or an allocation failure:
     *     *buffer = NULL, *numStrings = 0, *stringLengths = NULL.
     *   - empty delimiter: no splitting, the whole input is the single string.
     *   - a delimiter at the start or end of the input, or two in a row,
     *     yields an empty string at that position.
     */
    void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
        const size_t lo = strlen(original);
        const size_t ld = strlen(delimiter);

        *buffer = NULL;
        *numStrings = 0;
        *stringLengths = NULL;
        if(ld > lo){
            return;
        }

        /* Count the pieces: one more than the number of delimiter hits.
         * An empty delimiter matches everywhere, so it is never searched for. */
        int count = 1;
        if(ld > 0){
            const char *hit = strstr(original, delimiter);
            while(hit != NULL){
                count++;
                hit = strstr(hit + ld, delimiter);
            }
        }

        /* calloc keeps unfilled slots NULL, which makes the error path simple */
        char **strings = calloc((size_t)count, sizeof *strings);
        int *lengths = malloc((size_t)count * sizeof *lengths);
        if(strings == NULL || lengths == NULL){
            free(strings);
            free(lengths);
            return;
        }

        const char *start = original;
        for(int k = 0;k < count;k++){
            const int last = (k == count - 1);
            /* Every piece but the last ends at the next hit, which the counting
             * pass guarantees exists; the last one runs to the end of the input. */
            const char *end = last ? original + lo : strstr(start, delimiter);
            const size_t len = (size_t)(end - start);

            strings[k] = malloc(len + 1);
            if(strings[k] == NULL){
                while(k > 0){
                    free(strings[--k]);
                }
                free(strings);
                free(lengths);
                return;
            }
            memcpy(strings[k], start, len);
            strings[k][len] = '\0';
            lengths[k] = (int)len;

            if(!last){
                start = end + ld;
            }
        }

        *buffer = strings;
        *numStrings = count;
        *stringLengths = lengths;
    }
    

    Sample code:

    /* Demonstrates splitString(): prints every piece and releases what it was given. */
    int main(){
        const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
        char **buffer;
        int numStrings;
        int * stringLengths;

        splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);

        for(int i = 0;i < numStrings;i++){
            printf("String: %s\n", buffer[i]);
            free(buffer[i]);    /* each piece is its own allocation */
        }

        /* the pointer array and the lengths array are separate allocations;
         * free(NULL) is a no-op, so this is also fine when nothing was split */
        free(buffer);
        free(stringLengths);
        return 0;
    }
    

    Libraries:

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>
    
    0 讨论(0)
  • 2020-11-21 12:17

    I think the following solution is ideal:

    • Doesn't destroy the source string
    • Re-entrant - i.e., you can safely call it from anywhere in one or more threads
    • Portable
    • Handles multiple separators correctly
    • Fast and efficient

    Explanation of the code:

    1. Define a structure token to store the address and lengths of the tokens
    2. Allocate enough memory for these in the worst case, which is when str is made up entirely of separators so there are strlen(str) + 1 tokens, all of them empty strings
    3. Scan str recording the address and length of every token
    4. Use this to allocate the output array of the correct size, including an extra space for a NULL sentinel value
    5. Allocate, copy, and add the tokens using the start and length information - use memcpy as it's faster than strcpy and we know the lengths
    6. Free the token address and length array
    7. Return the array of tokens
    /* One piece of the input: where it begins inside str and how many chars it spans. */
    typedef struct {
        const char *start;
        size_t len;
    } token;

    /*
     * Split str at every occurrence of sep without modifying str.
     *
     * Returns a newly allocated array of newly allocated NUL-terminated
     * copies of the pieces, followed by a NULL entry. Empty pieces (adjacent,
     * leading or trailing separators) are kept as empty strings.
     * The caller frees every string and then the array.
     * Allocation results are not checked.
     */
    char **split(const char *str, char sep)
    {
        /* worst case: every character is a separator -> strlen(str) + 1 pieces */
        token *spans = malloc((strlen(str) + 1) * sizeof(token));
        unsigned int nspans = 0;
        const char *begin = str;
        const char *cur;

        /* record where each piece starts and how long it is */
        for (cur = str; *cur != '\0'; cur++) {
            if (*cur != sep)
                continue;
            spans[nspans].start = begin;
            spans[nspans].len = (size_t)(cur - begin);
            nspans++;
            begin = cur + 1;
        }
        /* whatever follows the final separator (possibly nothing) is the last piece */
        spans[nspans].start = begin;
        spans[nspans].len = (size_t)(cur - begin);
        nspans++;

        /* one extra slot for the NULL sentinel */
        char **array = malloc((nspans + 1) * sizeof(char*));
        char **out = array;
        for (const token *s = spans; s < spans + nspans; s++) {
            /* calloc zero-fills, so the copy is already NUL-terminated */
            char *copy = calloc(s->len + 1, 1);
            memcpy(copy, s->start, s->len);
            *out++ = copy;
        }
        *out = NULL;

        free(spans);
        return array;
    }

    Note malloc checking omitted for brevity.

    In general, I wouldn't return an array of char * pointers from a split function like this as it places a lot of responsibility on the caller to free them correctly. An interface I prefer is to allow the caller to pass a callback function and call this for every token, as I have described here: Split a String in C.

    0 讨论(0)
  • 2020-11-21 12:18

    Here is another implementation that will operate safely to tokenize a string-literal matching the prototype requested in the question returning an allocated pointer-to-pointer to char (e.g. char **). The delimiter string can contain multiple characters, and the input string can contain any number of tokens. All allocations and reallocations are handled by malloc or realloc without POSIX strdup.

    The initial number of pointers allocated is controlled by the NPTRS constant and the only limitation is that it be greater than zero. The char ** returned contains a sentinel NULL after the last token similar to *argv[] and in the form usable by execv, execvp and execve.

    As with strtok() multiple sequential delimiters are treated as a single delimiter, so "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC" will be parsed as if only a single ',' separates "MAY,JUN".

    The function below is commented in-line, and a short main() was added that splits the months. The initial number of pointers allocated was set to 2 to force three reallocations while tokenizing the input string:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    #define NPTRS 2     /* starting capacity of the pointer array (must be > 0) */

    /* Tokenize src on any character found in delim.
     * Runs of delimiter characters count as one separator, so no empty
     * tokens are produced. Returns an allocated array of allocated strings
     * terminated by a NULL entry, or NULL if the initial array cannot be
     * allocated. If a later allocation fails, the tokens collected so far
     * are returned. Capacity doubles whenever the array fills up.
     */
    char **strsplit (const char *src, const char *delim)
    {
        int used = 0, inword = 0, cap = NPTRS;  /* filled slots, in-token flag, capacity */
        const char *start = src, *cur = src;    /* token start and scan position */
        char **dest = malloc (cap * sizeof *dest);

        if (dest == NULL) {
            perror ("malloc-dest");
            return NULL;
        }
        dest[0] = NULL;     /* array is always NULL-terminated, even when empty */

        while (1) {
            int at_end = (*cur == '\0');

            /* ordinary character: open a token if none is open, then move on */
            if (!at_end && strchr (delim, *cur) == NULL) {
                if (!inword) {
                    start = cur;
                    inword = 1;
                }
                cur++;
                continue;
            }

            /* delimiter or end of input: close the open token, if any */
            size_t len = cur - start;
            if (inword && len) {
                if (used == cap - 1) {  /* keep one slot free for the sentinel */
                    void *grown = realloc (dest, 2 * cap * sizeof *dest);
                    if (grown == NULL) {
                        perror ("realloc-dest");
                        break;          /* dest is still valid and terminated */
                    }
                    dest = grown;
                    cap *= 2;
                }
                dest[used] = malloc (len + 1);
                if (dest[used] == NULL) {
                    perror ("malloc-dest[i]");
                    break;              /* the failed slot doubles as the sentinel */
                }
                memcpy (dest[used], start, len);
                dest[used][len] = 0;
                used++;
                dest[used] = NULL;
            }

            if (at_end)
                break;
            inword = 0;
            cur++;
        }

        return dest;
    }
    
    /* Split the month list with strsplit(), printing and freeing each token. */
    int main (void) {

        char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
        char **tokens = strsplit (str, ",");    /* NULL means nothing to print or free */

        if (tokens != NULL) {
            /* walk the array until its NULL entry */
            for (size_t k = 0; tokens[k] != NULL; k++) {
                puts (tokens[k]);
                free (tokens[k]);   /* each token is a separate allocation */
            }
            free (tokens);          /* finally the pointer array itself */
        }
    }
    

    Example Use/Output

    $ ./bin/splitinput
    JAN
    FEB
    MAR
    APR
    MAY
    JUN
    JUL
    AUG
    SEP
    OCT
    NOV
    DEC
    

    Let me know if you have any further questions.

    0 讨论(0)
  • 2020-11-21 12:20

    This function takes a char* string and splits it by the delimiter. There can be multiple delimiters in a row. Note that the function modifies the original string. You must make a copy of the original string first if you need the original to stay unaltered. This function doesn't use any cstring function calls so it might be a little faster than others. If you don't care about memory allocation, you can allocate sub_strings at the top of the function with size strlen(src_str)/2 and (like the c++ "version" mentioned) skip the bottom half of the function. If you do this, the function is reduced to O(N), but the memory optimized way shown below is O(2N).

    The function:

    /*
     * Split src_str in place on `deliminator`.
     *
     * Every delimiter character in src_str is overwritten with '\0'; the
     * returned pointers point INTO src_str, so src_str must outlive the
     * result and the individual substrings must not be freed.
     * Leading, trailing and repeated delimiters produce no empty substrings.
     *
     * num_sub_str - out: number of substrings found (0 when NULL is returned).
     *
     * Returns a malloc'd array of num_sub_str pointers followed by a NULL
     * sentinel (release it with free()), or NULL when there are no
     * substrings or the array cannot be allocated.
     */
    char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
      //replace deliminator's with zeros and count how many
      //sub strings with length >= 1 exist
      num_sub_str = 0;
      char *src_str_tmp = src_str;
      bool found_delim = true;
      while(*src_str_tmp){
        if(*src_str_tmp == deliminator){
          *src_str_tmp = 0;
          found_delim = true;
        }
        else if(found_delim){ //found first character of a new string
          num_sub_str++;
          found_delim = false;
          //sub_str_vec.push_back(src_str_tmp); //for c++
        }
        src_str_tmp++;
      }
      //%zu is the conversion for size_t; %d with a size_t argument is undefined
      printf("Start - found %zu sub strings\n", num_sub_str);
      if(num_sub_str == 0){
        printf("str_split() - no substrings were found\n");
        return(0);
      }

      //if you want to use a c++ vector and push onto it, the rest of this function
      //can be omitted (obviously modifying input parameters to take a vector, etc)

      //one pointer per substring plus one more pointer for the NULL sentinel
      char **sub_strings = (char **)malloc(sizeof(char*) * (num_sub_str + 1));
      if(!sub_strings){
        num_sub_str = 0;
        return(0);
      }
      //src_str_tmp stopped on the original terminator, which marks the end of the scan
      const char *src_str_terminator = src_str_tmp;
      src_str_tmp = src_str;
      bool found_null = true;
      size_t idx = 0;
      while(src_str_tmp < src_str_terminator){
        if(!*src_str_tmp) //found a '\0' left behind by a delimiter
          found_null = true;
        else if(found_null){ //first character after a run of '\0' starts a substring
          sub_strings[idx++] = src_str_tmp;
          found_null = false;
        }
        src_str_tmp++;
      }
      sub_strings[num_sub_str] = NULL;

      return(sub_strings);
    }
    

    How to use it:

      char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
      /* str_split() overwrites the delimiters in its input, so hand it a copy */
      char *str = strdup(months);
      size_t num_sub_str;
      char **sub_strings = str_split(str, ',', num_sub_str);
      if(sub_strings){
        /* the array ends with a NULL entry */
        for(int i = 0; sub_strings[i]; i++)
          printf("[%s]\n", sub_strings[i]);
      }
      /* the entries point into str, so only the array and the copy are freed */
      free(sub_strings);
      free(str);
    
    0 讨论(0)
提交回复
热议问题