How do I write a function to split and return an array for a string with delimiters in the C programming language?
char* str = \"JAN,FEB,MAR,APR,MAY,JUN,JUL,
Here is my two cents:
int split (const char *txt, char delim, char ***tokens)
{
int *tklen, *t, count = 1;
char **arr, *p = (char *) txt;
while (*p != '\0') if (*p++ == delim) count += 1;
t = tklen = calloc (count, sizeof (int));
for (p = (char *) txt; *p != '\0'; p++) *p == delim ? *t++ : (*t)++;
*tokens = arr = malloc (count * sizeof (char *));
t = tklen;
p = *arr++ = calloc (*(t++) + 1, sizeof (char *));
while (*txt != '\0')
{
if (*txt == delim)
{
p = *arr++ = calloc (*(t++) + 1, sizeof (char *));
txt++;
}
else *p++ = *txt++;
}
free (tklen);
return count;
}
Usage:
char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);
/* freeing tokens */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);
String tokenizer this code should put you in the right direction.
int main(void) {
char st[] ="Where there is will, there is a way.";
char *ch;
ch = strtok(st, " ");
while (ch != NULL) {
printf("%s\n", ch);
ch = strtok(NULL, " ,");
}
getch();
return 0;
}
This is a string splitting function that can handle multi-character delimiters. Note that if the delimiter is longer than the string that is being split, then buffer
and stringLengths
will be set to (void *) 0
, and numStrings
will be set to 0
.
This algorithm has been tested, and works. (Disclaimer: It has not been tested for non-ASCII strings, and it assumes that the caller gave valid parameters)
void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
const int lo = strlen(original);
const int ld = strlen(delimiter);
if(ld > lo){
*buffer = (void *)0;
*numStrings = 0;
*stringLengths = (void *)0;
return;
}
*numStrings = 1;
for(int i = 0;i < (lo - ld);i++){
if(strncmp(&original[i], delimiter, ld) == 0) {
i += (ld - 1);
(*numStrings)++;
}
}
*stringLengths = (int *) malloc(sizeof(int) * *numStrings);
int currentStringLength = 0;
int currentStringNumber = 0;
int delimiterTokenDecrementCounter = 0;
for(int i = 0;i < lo;i++){
if(delimiterTokenDecrementCounter > 0){
delimiterTokenDecrementCounter--;
} else if(i < (lo - ld)){
if(strncmp(&original[i], delimiter, ld) == 0){
(*stringLengths)[currentStringNumber] = currentStringLength;
currentStringNumber++;
currentStringLength = 0;
delimiterTokenDecrementCounter = ld - 1;
} else {
currentStringLength++;
}
} else {
currentStringLength++;
}
if(i == (lo - 1)){
(*stringLengths)[currentStringNumber] = currentStringLength;
}
}
*buffer = (char **) malloc(sizeof(char *) * (*numStrings));
for(int i = 0;i < *numStrings;i++){
(*buffer)[i] = (char *) malloc(sizeof(char) * ((*stringLengths)[i] + 1));
}
currentStringNumber = 0;
currentStringLength = 0;
delimiterTokenDecrementCounter = 0;
for(int i = 0;i < lo;i++){
if(delimiterTokenDecrementCounter > 0){
delimiterTokenDecrementCounter--;
} else if(currentStringLength >= (*stringLengths)[currentStringNumber]){
(*buffer)[currentStringNumber][currentStringLength] = 0;
delimiterTokenDecrementCounter = ld - 1;
currentStringLength = 0;
currentStringNumber++;
} else {
(*buffer)[currentStringNumber][currentStringLength] = (char)original[i];
currentStringLength++;
}
}
buffer[currentStringNumber][currentStringLength] = 0;
}
Sample code:
int main(){
const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
char **buffer;
int numStrings;
int * stringLengths;
splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);
for(int i = 0;i < numStrings;i++){
printf("String: %s\n", buffer[i]);
}
}
Libraries:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
I think the following solution is ideal:
Explanation of the code:
token
to store the address and lengths of the tokensstr
is made up entirely of separators so there are strlen(str) + 1
tokens, all of them empty stringsstr
recording the address and length of every tokenNULL
sentinel valuememcpy
as it's faster than strcpy
and we know
the lengthstypedef struct {
const char *start;
size_t len;
} token;
char **split(const char *str, char sep)
{
char **array;
unsigned int start = 0, stop, toks = 0, t;
token *tokens = malloc((strlen(str) + 1) * sizeof(token));
for (stop = 0; str[stop]; stop++) {
if (str[stop] == sep) {
tokens[toks].start = str + start;
tokens[toks].len = stop - start;
toks++;
start = stop + 1;
}
}
/* Mop up the last token */
tokens[toks].start = str + start;
tokens[toks].len = stop - start;
toks++;
array = malloc((toks + 1) * sizeof(char*));
for (t = 0; t < toks; t++) {
/* Calloc makes it nul-terminated */
char *token = calloc(tokens[t].len + 1, 1);
memcpy(token, tokens[t].start, tokens[t].len);
array[t] = token;
}
/* Add a sentinel */
array[t] = NULL;
free(tokens);
return array;
}
Note malloc
checking omitted for brevity.
In general, I wouldn't return an array of char *
pointers from a split function like this as it places a lot of responsibility on the caller to free them correctly. An interface I prefer is to allow the caller to pass a callback function and call this for every token, as I have described here: Split a String in C.
Here is another implementation that will operate safely to tokenize a string-literal matching the prototype requested in the question returning an allocated pointer-to-pointer to char (e.g. char **
). The delimiter string can contain multiple characters, and the input string can contain any number of tokens. All allocations and reallocations are handled by malloc
or realloc
without POSIX strdup
.
The initial number of pointers allocated is controlled by the NPTRS
constant and the only limitation is that it be greater than zero. The char **
returned contains a sentinel NULL
after the last token similar to *argv[]
and in the form usable by execv
, execvp
and execve
.
As with strtok()
multiple sequential delimiters are treated as a single delimiter, so "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
will be parsed as if only a single ','
separates "MAY,JUN"
.
The function below is commented in-line and a short main()
was added splitting the months. The initial number of pointers allocated was set at 2
to force three reallocation during tokenizing the input string:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NPTRS 2 /* initial number of pointers to allocate (must be > 0) */
/* split src into tokens with sentinel NULL after last token.
* return allocated pointer-to-pointer with sentinel NULL on success,
* or NULL on failure to allocate initial block of pointers. The number
* of allocated pointers are doubled each time reallocation required.
*/
char **strsplit (const char *src, const char *delim)
{
int i = 0, in = 0, nptrs = NPTRS; /* index, in/out flag, ptr count */
char **dest = NULL; /* ptr-to-ptr to allocate/fill */
const char *p = src, *ep = p; /* pointer and end-pointer */
/* allocate/validate nptrs pointers for dest */
if (!(dest = malloc (nptrs * sizeof *dest))) {
perror ("malloc-dest");
return NULL;
}
*dest = NULL; /* set first pointer as sentinel NULL */
for (;;) { /* loop continually until end of src reached */
if (!*ep || strchr (delim, *ep)) { /* if at nul-char or delimiter char */
size_t len = ep - p; /* get length of token */
if (in && len) { /* in-word and chars in token */
if (i == nptrs - 1) { /* used pointer == allocated - 1? */
/* realloc dest to temporary pointer/validate */
void *tmp = realloc (dest, 2 * nptrs * sizeof *dest);
if (!tmp) {
perror ("realloc-dest");
break; /* don't exit, original dest still valid */
}
dest = tmp; /* assign reallocated block to dest */
nptrs *= 2; /* increment allocated pointer count */
}
/* allocate/validate storage for token */
if (!(dest[i] = malloc (len + 1))) {
perror ("malloc-dest[i]");
break;
}
memcpy (dest[i], p, len); /* copy len chars to storage */
dest[i++][len] = 0; /* nul-terminate, advance index */
dest[i] = NULL; /* set next pointer NULL */
}
if (!*ep) /* if at end, break */
break;
in = 0; /* set in-word flag 0 (false) */
}
else { /* normal word char */
if (!in) /* if not in-word */
p = ep; /* update start to end-pointer */
in = 1; /* set in-word flag 1 (true) */
}
ep++; /* advance to next character */
}
return dest;
}
int main (void) {
char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC",
**tokens; /* pointer to pointer to char */
if ((tokens = strsplit (str, ","))) { /* split string into tokens */
for (char **p = tokens; *p; p++) { /* loop over filled pointers */
puts (*p);
free (*p); /* don't forget to free allocated strings */
}
free (tokens); /* and pointers */
}
}
Example Use/Output
$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC
Let me know if you have any further questions.
This function takes a char* string and splits it by the deliminator. There can be multiple deliminators in a row. Note that the function modifies the orignal string. You must make a copy of the original string first if you need the original to stay unaltered. This function doesn't use any cstring function calls so it might be a little faster than others. If you don't care about memory allocation, you can allocate sub_strings at the top of the function with size strlen(src_str)/2 and (like the c++ "version" mentioned) skip the bottom half of the function. If you do this, the function is reduced to O(N), but the memory optimized way shown below is O(2N).
The function:
char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
//replace deliminator's with zeros and count how many
//sub strings with length >= 1 exist
num_sub_str = 0;
char *src_str_tmp = src_str;
bool found_delim = true;
while(*src_str_tmp){
if(*src_str_tmp == deliminator){
*src_str_tmp = 0;
found_delim = true;
}
else if(found_delim){ //found first character of a new string
num_sub_str++;
found_delim = false;
//sub_str_vec.push_back(src_str_tmp); //for c++
}
src_str_tmp++;
}
printf("Start - found %d sub strings\n", num_sub_str);
if(num_sub_str <= 0){
printf("str_split() - no substrings were found\n");
return(0);
}
//if you want to use a c++ vector and push onto it, the rest of this function
//can be omitted (obviously modifying input parameters to take a vector, etc)
char **sub_strings = (char **)malloc( (sizeof(char*) * num_sub_str) + 1);
const char *src_str_terminator = src_str_tmp;
src_str_tmp = src_str;
bool found_null = true;
size_t idx = 0;
while(src_str_tmp < src_str_terminator){
if(!*src_str_tmp) //found a NULL
found_null = true;
else if(found_null){
sub_strings[idx++] = src_str_tmp;
//printf("sub_string_%d: [%s]\n", idx-1, sub_strings[idx-1]);
found_null = false;
}
src_str_tmp++;
}
sub_strings[num_sub_str] = NULL;
return(sub_strings);
}
How to use it:
char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
char *str = strdup(months);
size_t num_sub_str;
char **sub_strings = str_split(str, ',', num_sub_str);
char *endptr;
if(sub_strings){
for(int i = 0; sub_strings[i]; i++)
printf("[%s]\n", sub_strings[i]);
}
free(sub_strings);
free(str);