how do I split a string into array of string? - 【StackMirror】|c|string|split

For example:

input(string): foo $$ foo ## foo []

search(string): foo

output(array): $$ ,## ,[]

I tried it:

char * str = "foo $$ foo ## foo []";
    char * s = "foo";

    int buf_len = 0;
    int len = strlen(s);
    int i = 0;

    char ** buffer = malloc(MAX_BUFFER_SIZE);
    char * tmpbuf = malloc(MAX_BUFFER_SIZE);
    char * p = str;
    char ** buf = buffer;
    char * tbuf = tmpbuf;

    while(*p)
    {
        if(*p == *s) 
        {
            while(*p == *(s + i)) 
            { 
                i++;
                p++;
            }

            if(i == len) 
            {
                *buf ++ = tbuf;
                memset(tbuf,0,buf_len);
                i = buf_len = 0;
            }
        }
        else 
        {
            *tbuf ++= *p;
            buf_len ++;
        }

        p++;
    }

    *buf ++= NULL;

    int x;
    for(x = 0; buffer[x]; x++)
    {
        printf("%s\n", buffer[x]);
    }

    free(buffer);
    free(tmpbuf);

that show the following output:

$$ ## []
## []
[]

but the expected is:

$$  
##  
[]

how to fix this?

2012-04-03 20:10
by Jack

why -1? can explain please - Jack 2012-04-03 20:20

possible duplicate of Split string at # in CBo Persson 2012-04-03 22:19

@BoPersson: No. This is not char question. I want to split string - Jack 2012-04-04 00:39

It is because you do not copy the contents of tbuf to buf when you say:

*buf ++ = tbuf;

What you do is save a reference to the current position in tbuf (or tmpbuf if you like).

tmpbuf get filled with everything but the delimiter.

It is something like, at end of loop:

          01234567 <- offset
tmpbuf = "$$ ## []"

buf[0] = tmpbuf+0;
buf[1] = tmpbuf+3;
buf[2] = tmpbuf+6;

Or very simplified memory table:

        memory
       address        value   
tmpbuf -> 0x01       [   $] <- buffer[0] points here
          0x02       [   $]
          0x03       [    ]
          0x04       [   #] <- buffer[1] points here
          0x05       [   #]
          0x06       [    ]
          0x07       [   [] <- buffer[2] points here
          0x08       [   ]]
          0x09       [    ]
          ...
buffer -> 0x3A       [0x01]
          0x3B       [0x04]
          0x3C       [0x07]
          0x3D       [    ]
          0x3E       [    ]
          ...

EDIT

For the phun of it; a pointer, dynamic, way, not using strstr().

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int is_needle(char *hay, char *needle)
{
    while (*hay && *++hay == *++needle);
    return *needle == '\0';
}

char *find(char *hay, char *needle)
{
    while (*hay) {
        if (*hay == *needle && is_needle(hay, needle))
            return hay;
        ++hay;
    }
    return hay;
}

int pushstr(char ***vs, size_t *vslen, char *val, size_t slen)
{
    char **vsp = *vs + *vslen;

    if ((*vsp = realloc(*(*vs + *vslen), slen + 1)) == NULL) {
        perror("pushstr.1"); exit(1);
    }

    memcpy(*vsp, val, slen);
    *(*vsp + slen) = '\0';

    if ((*vs  = realloc(*vs, sizeof(char*) * (++*vslen + 1))) == NULL) {
        perror("pushstr.2"); exit(1);
    }
    *(*vs + *vslen) = NULL;

    return *vslen;
}

int main(void)
{
    char *hay    = "foo $$ foo ## foo [] fox @@ foo ??";
    char *needle = "foo";
    char *np;
    char **vs;
    size_t vslen = 0;
    size_t nlen  = strlen(needle);

    if ((vs = malloc(sizeof(char*))) == NULL) {
        perror("main");
        return 1;
    }
    *vs = NULL;

    while (*(np = find(hay, needle))) {
        if (np != hay) {
            pushstr(&vs, &vslen, hay, np - hay);
            hay = np + nlen;
        } else {
            hay += nlen;
        }
    }
    if (np != hay)
        pushstr(&vs, &vslen, hay, np - hay);

    while (*vs)
        printf("V: '%s'\n", *vs++);
    vs -= vslen;

    while (*vs)
        free(*vs++);
    vs -= vslen;
    free(vs);

    return 0;
}

2012-04-03 21:58
by Morpfh

Here's a function for splitting a string into an array of strings:

#include <assert.h>
#include <string.h>

/*
 * Split a string by a delimiter.
 *
 * This function writes the beginning of each item to @pointers_out
 * (forming an array of C strings), and writes the actual string bytes
 * to @bytes_out.  Both buffers are assumed to be big enough for all of the
 * strings.
 *
 * Returns the number of strings written to @pointers_out.
 */
size_t explode(const char *delim, const char *str,
               char **pointers_out, char *bytes_out)
{
    size_t  delim_length        = strlen(delim);
    char   **pointers_out_start = pointers_out;

    assert(delim_length > 0);

    for (;;) {
        /* Find the next occurrence of the item delimiter. */
        const char *delim_pos = strstr(str, delim);

        /*
         * Emit the current output buffer position, since that is where the
         * next item will be written.
         */
        *pointers_out++ = bytes_out;

        if (delim_pos == NULL) {
            /*
             * No more item delimiters left.  Treat the rest of the input
             * string as the last item.
             */
            strcpy(bytes_out, str);
            return pointers_out - pointers_out_start;
        } else {
            /*
             * Item delimiter found.  The bytes leading up to it form the next
             * string.
             */
            while (str < delim_pos)
                *bytes_out++ = *str++;

            /* Don't forget the NUL terminator. */
            *bytes_out++ = '\0';

            /* Skip over the delimiter. */
            str += delim_length;
        }
    }
}

Usage:

#include <stdio.h>
/* ... */

#define BIG_ENOUGH 1000

int main(void)
{
    char    *items[BIG_ENOUGH];
    char     item_bytes[BIG_ENOUGH];
    size_t   i;
    size_t   count;

    count = explode("foo", "foo $$ foo ## foo []", items, item_bytes);

    for (i = 0; i < count; i++)
        printf("\"%s\"\n", items[i]);

    return 0;
}

Output:

""
" $$ "
" ## "
" []"

This does not produce the exact output you asked for, as I'm not sure how you want to handle surrounding spaces and occurrences of the item delimiter (in your example, "foo") at the beginning of the string. Instead, I mimicked PHP's explode function.

I'd like to point out how my explode function punts on memory management. It is up to the caller to ensure the buffers are big enough. This is fine for a quick script, but might be annoying in a more serious program, where you'll have to do some math to use this function correctly. I could have written a more "robust" implementation that performs its own allocation, but:

That would clutter the implementation.
It doesn't give the caller the option of using their own memory allocator.

So implementing explode the way I did is "bad" because it is hard to use correctly, and worse, easy to use incorrectly. On the other hand, it is "good" in that it separates the concerns of functionality and memory management.

2012-04-03 21:09
by Joey Adams

This is a task for strstr(). I changed your code a little bit to make use of it.

int add_to_buf(char *str, size_t len, char ***buf)
{
  if (len <= 0) return 0;
  **buf = malloc (len);
  strncpy (**buf, str, len);
  ++*buf;
  return 1;
}

int main()
{
  char *str = "foo $$ foo ## foo []";
  char *s = "foo";

  char **buffer = malloc (MAX_BUFFER_SIZE*sizeof(*buffer)), **buf = buffer;
  char *start, *end;

  int s_len = strlen (s);

  start = str;
  end = strstr (str, s);
  while (end) {
    add_to_buf (start, end-start, &buf);
    start = end + s_len;
    end = strstr (start, s);
  }
  add_to_buf (start, strlen (str) - (start-str), &buf);
  *buf = 0;

  for (buf = buffer; *buf; ++buf)
      printf ("%s\n", *buf);

  free (buffer);
  return 0;
}

2012-04-03 20:50
by ipc

You are using too many pointers for a simple program and the the way you used them makes it hard to understand. One straightforward bug I see is you are using buffer** (array of strings) but you are only allocating a single string. You are this this array of strings to store the tokens, which will do some memory violation somewhere.

Since you want to print the tokens, you don't need to store them in a separate array. This will do:

#include<stdio.h>
#include<string.h>

int main(int ac, char*argv[]) {
char str[] = "foo $$ foo ## foo []";
char * s = "foo";   
char *p;

p = strtok(str, " "); // tokenize

while(p!=NULL)
{
if(strcmp(p, s)) //print non matching tokens
printf("%s\n", p);
p = strtok(NULL, " ");
}

return 0;
}

Note that here the delimiter is whitespace which makes it easier here.

2012-04-03 20:56
by P.P.

The strtok function was designed for this task:

#include <string.h>
...
char *token;
char *line = "LINE TO BE SEPARATED";
char *search = " ";


/* Token will point to "LINE". */
token = strtok(line, search);


/* Token will point to "TO". */
token = strtok(NULL, search);

2012-04-03 20:29
by Karl Bielefeldt

This doesn't work because strtok's delimiter string considers each of its individual characters as valid delimiters. Using "foo" as a strtok delimiter string would split an input like "f $$ f ## f []" into three tokens, which isn't what the OP wants - Matt Eckert 2012-04-03 20:35

Thanks, I didn't notice his multi-character delimiter. Still, using other standard functions like strstr and strcpy is going to make the task a lot clearer - Karl Bielefeldt 2012-04-03 20:46

@KarlBielefeldt: Can you give me a simple example how to do it - Jack 2012-04-04 00:44