繁体   English   中英

在C中使用分隔符拆分字符串

[英]Split string with delimiters in C

如何在 C 编程语言中编写一个函数来拆分并返回带有分隔符的字符串的数组?

/* Desired usage: split the month list on ','.
   NOTE(review): str points at a string literal here; a splitter that writes
   into its input (e.g. one built on strtok) needs a writable copy instead. */
char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
str_split(str,',');

您可以使用strtok()函数来拆分字符串(并指定要使用的分隔符)。 请注意, strtok()将修改传递给它的字符串。 如果在其他地方需要原始字符串,请复制它并将副本传递给strtok()

编辑:

示例(注意它不处理连续的分隔符,例如“JAN,,,FEB,MAR”):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/*
 * Split a_str on a_delim and return a newly allocated, NULL-terminated
 * array of newly allocated token copies.
 *
 * a_str is tokenized with strtok(), so it is modified in place; pass a
 * writable copy if the original text is still needed.  Empty tokens
 * (leading, trailing or consecutive delimiters) are skipped, exactly as
 * strtok() skips them.
 *
 * Returns NULL if a_str is NULL or an allocation fails.  The caller frees
 * every element and then the array itself.
 */
char** str_split(char* a_str, const char a_delim)
{
    char** result = NULL;
    size_t count  = 0;
    size_t idx    = 0;
    int in_token  = 0;
    char* tmp;
    char* token;
    char delim[2];

    if (a_str == NULL)
    {
        return NULL;
    }

    delim[0] = a_delim;
    delim[1] = 0;

    /* Count the non-empty runs between delimiters: that is precisely the
       number of tokens strtok() will hand back below. */
    for (tmp = a_str; *tmp; tmp++)
    {
        if (a_delim == *tmp)
        {
            in_token = 0;
        }
        else if (!in_token)
        {
            in_token = 1;
            count++;
        }
    }

    /* One extra slot for the terminating NULL so the caller knows where
       the list of returned strings ends. */
    result = malloc(sizeof(char*) * (count + 1));
    if (result == NULL)
    {
        return NULL;
    }

    for (token = strtok(a_str, delim); token != NULL; token = strtok(NULL, delim))
    {
        size_t size = strlen(token) + 1;
        char* copy  = malloc(size);

        if (copy == NULL)
        {
            /* Undo the partial result instead of returning a broken list. */
            while (idx > 0)
            {
                free(result[--idx]);
            }
            free(result);
            return NULL;
        }
        memcpy(copy, token, size);
        result[idx++] = copy;
    }
    result[idx] = NULL;

    return result;
}

/* Demo driver: split a month list, print each token and release everything. */
int main()
{
    char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char** tokens;
    char** cursor;

    printf("months=[%s]\n\n", months);

    tokens = str_split(months, ',');
    if (tokens == NULL)
    {
        return 0;
    }

    /* The array ends with a NULL sentinel, so walk until it is reached. */
    for (cursor = tokens; *cursor != NULL; cursor++)
    {
        printf("month=[%s]\n", *cursor);
        free(*cursor);
    }
    printf("\n");
    free(tokens);

    return 0;
}

输出:

$ ./main.exe
months=[JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC]

month=[JAN]
month=[FEB]
month=[MAR]
month=[APR]
month=[MAY]
month=[JUN]
month=[JUL]
month=[AUG]
month=[SEP]
month=[OCT]
month=[NOV]
month=[DEC]

我认为strsep仍然是最好的工具:

/* Hand each comma-separated token to my_fn(); the loop ends when strsep() returns NULL. */
while ((token = strsep(&str, ","))) my_fn(token);

这实际上是分割字符串的一行。

额外的括号是一种风格元素,表明我们有意测试赋值的结果,而不是相等运算符==

要使该模式起作用,token 和 str 都必须是 char * 类型。 如果您从字符串字面量开始,那么需要先复制它:

// More general pattern: tokenize a private, writable copy of a string literal.
const char *my_str_literal = "JAN,FEB,MAR";
char *token, *str, *tofree;

// strsep() moves str forward, so the start of the copy is kept in tofree for free().
tofree = str = strdup(my_str_literal);  // We own str's memory now.
while ((token = strsep(&str, ","))) my_fn(token);
free(tofree);

如果两个分隔符一起出现在str中,您将获得一个空字符串的token值。 str的值被修改,因为遇到的每个分隔符都被零字节覆盖 - 这是复制首先被解析的字符串的另一个好理由。

在评论中,有人认为 strtok 比 strsep 更好,因为 strtok 的可移植性更强。 Ubuntu 和 Mac OS X 都提供了 strsep;可以合理推测,其他类 Unix 系统也同样提供。 Windows 没有 strsep,但它有 strpbrk,借助它可以实现下面这个简短的 strsep 替代品:

/*
 * Minimal strsep() stand-in built on strpbrk().
 *
 * Returns the token starting at *stringp, or NULL when *stringp is NULL.
 * The first character of the token that belongs to delim is overwritten
 * with '\0' and *stringp is moved just past it; when no delimiter is left,
 * *stringp becomes NULL so the next call ends the iteration.
 */
char *strsep(char **stringp, const char *delim) {
  char *begin = *stringp;
  char *hit;

  if (begin == NULL) {
    return NULL;
  }

  hit = strpbrk(begin, delim);
  if (hit == NULL) {
    *stringp = NULL;
  } else {
    *hit = '\0';
    *stringp = hit + 1;
  }
  return begin;
}

strsep vs strtok的一个很好的解释。 可以主观判断优劣; 但是,我认为这是一个明显的迹象,表明strsep被设计为strtok的替代品。

字符串标记器此代码应该让您朝着正确的方向前进。

/* Tokenizer demo: print every word of a sentence on its own line. */
int main(void) {
  char st[] ="Where there is will, there is a way.";
  /* One delimiter set for every call, so the first token is cut by the same
     rules (blank or comma) as all the following ones. */
  const char *delims = " ,";
  char *ch;

  for (ch = strtok(st, delims); ch != NULL; ch = strtok(NULL, delims)) {
    printf("%s\n", ch);
  }

  /* getch() is a non-standard <conio.h> call; getchar() is the portable way
     to wait for the user before the program exits. */
  getchar();
  return 0;
}

下面的方法将为您完成所有工作(内存分配,计算长度)。 更多信息和描述可以在这里找到 - Java String.split() 方法来拆分 C 字符串的实现

/*
 * Split str on every occurrence of c (Java String.split() style).
 *
 * On success *arr receives a newly allocated array of newly allocated,
 * NUL-terminated token copies and the number of tokens is returned.  The
 * count is always "number of delimiters + 1", so adjacent delimiters produce
 * empty strings.  str itself is not modified.
 *
 * On allocation failure everything is released, *arr is set to NULL and -1
 * is returned.  The caller frees each token and then the array.
 */
int split (const char *str, char c, char ***arr)
{
    int count = 1;
    int i = 0;
    const char *p;
    const char *start;
    char **out;

    /* One token more than there are delimiters. */
    for (p = str; *p != '\0'; p++)
    {
        if (*p == c)
            count++;
    }

    out = malloc(sizeof *out * (size_t) count);
    if (out == NULL)
    {
        *arr = NULL;
        return -1;
    }

    /* Every delimiter, and finally the terminator, closes one token. */
    start = str;
    for (p = str; i < count; p++)
    {
        size_t len;

        if (*p != c && *p != '\0')
            continue;

        len = (size_t) (p - start);
        out[i] = malloc(len + 1);
        if (out[i] == NULL)
        {
            while (i > 0)
                free(out[--i]);
            free(out);
            *arr = NULL;
            return -1;
        }
        memcpy(out[i], start, len);
        out[i][len] = '\0';     /* the last token needs its terminator too */
        i++;
        start = p + 1;
    }

    *arr = out;
    return count;
}

如何使用它:

/* Demo: split a sentence on blanks, print the pieces, then release them. */
int main (int argc, char ** argv)
{
    int i;
    const char *s = "Hello, this is a test module for the string splitting.";
    int c = 0;
    char **arr = NULL;

    (void) argc;
    (void) argv;

    c = split(s, ' ', &arr);

    printf("found %d tokens.\n", c);

    for (i = 0; i < c; i++)
    {
        printf("string #%d: %s\n", i, arr[i]);
        free(arr[i]);   /* each token is its own allocation */
    }
    free(arr);          /* free(NULL) is a no-op if nothing was produced */

    return 0;
}

这是我的两分钱:

/*
 * Split txt on delim without modifying it.
 *
 * *tokens receives a newly allocated array of newly allocated token copies;
 * the return value is the number of tokens (delimiters + 1, so adjacent
 * delimiters yield empty strings).  On allocation failure *tokens is NULL
 * and -1 is returned.  The caller frees every token and then the array.
 */
int split (const char *txt, char delim, char ***tokens)
{
    const char *begin = txt, *end;
    char **arr;
    int count = 1, filled = 0;

    for (end = txt; *end != '\0'; end++)
        if (*end == delim) count += 1;

    *tokens = NULL;
    arr = malloc ((size_t) count * sizeof *arr);
    if (arr == NULL)
        return -1;

    while (filled < count)
    {
        size_t len;
        char *copy;

        /* Advance to the delimiter or terminator that closes this token. */
        for (end = begin; *end != '\0' && *end != delim; end++)
            ;
        len = (size_t) (end - begin);

        /* calloc() zero-fills, which supplies the terminating NUL. */
        copy = calloc (len + 1, sizeof *copy);
        if (copy == NULL)
        {
            while (filled > 0) free (arr[--filled]);
            free (arr);
            return -1;
        }
        memcpy (copy, begin, len);
        arr[filled++] = copy;
        begin = end + 1;
    }

    *tokens = arr;
    return count;
}

用法:

/* Split the month list on ',' and print one token per line. */
char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);

/* freeing tokens: every string first, then the array that held them */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>

/**
 *  Splits str in place on delim and dynamically allocates an array of
 *  pointers into str.
 *
 *  Returns 0 on success and -1 on error (check errno: EINVAL for a NULL
 *  argument or a '\0' delimiter, otherwise whatever calloc() reported).
 *  On success *array holds the pointer array and *length its size, which is
 *  0 for an empty string and 1 if no delim was found.
 *
 *  Empty tokens are stored as NULL, so "foo,,bar" gives
 *  { "foo", NULL, "bar" } and a trailing delim such as "foo," gives a
 *  two-entry array whose second entry is NULL.
 *
 *  Modifies str (each delim becomes '\0'), so make a copy if that is a
 *  problem.  Only the array has to be freed; the entries point into str.
 */
int split( char * str, char delim, char ***array, int *length ) {
  char *p;
  char **res;
  int count=0;
  int k=0;

  if( !str || !array || !length || delim == '\0' ) {
    errno = EINVAL;
    return -1;
  }

  // A non-empty string holds one more token than it holds delimiters
  if( *str ) {
    count = 1;
    for( p = str; (p = strchr(p, delim)) != NULL; p++ ) {
      count++;
    }
  }

  // allocate dynamic array (never ask for 0 elements: that may return NULL)
  res = calloc( (size_t)(count ? count : 1), sizeof *res );
  if( !res ) return -1;

  p = str;
  for( k=0; k<count; k++ ){
    char *end = strchr(p, delim);
    if( end ) *end = 0;       // Null terminate at the delimiter
    if( *p ) res[k] = p;      // Empty tokens keep the NULL from calloc
    if( end ) p = end + 1;    // Start of next token
  }

  *array = res;
  *length = count;

  return 0;
}

/* Writable input: split() overwrites the delimiters in this buffer. */
char str[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,";

/* Demo: split the month list and print every entry. */
int main() {
  char **res = NULL;
  int k=0;
  int count =0;
  int rc;

  rc = split( str, ',', &res, &count );
  if( rc ) {
    printf("Error: %s errno: %d \n", strerror(errno), errno);
    return 1;   /* res was never set, so there is nothing to print or free */
  }

  printf("count: %d\n", count );
  for( k=0; k<count; k++ ) {
    /* split() stores NULL for empty tokens; never hand NULL to %s */
    printf("str: %s\n", res[k] ? res[k] : "(null)");
  }

  free(res );
  return 0;
}

我认为以下解决方案是理想的:

  • 不破坏源字符串
  • 可重入 - 即,您可以从一个或多个线程中的任何位置安全地调用它
  • 便携的
  • 正确处理多个分隔符
  • 快速高效

代码说明:

  1. 定义一个结构token来存储token的地址和长度
  2. 在最坏的情况下为这些分配足够的内存,即str完全由分隔符组成,因此有strlen(str) + 1标记,它们都是空字符串
  3. 扫描str记录每个token的地址和长度
  4. 使用它来分配正确大小的输出数组,包括用于NULL标记值的额外空间
  5. 使用开始和长度信息分配、复制和添加标记 - 使用memcpy ,因为它比strcpy更快,而且我们知道长度
  6. 释放令牌地址和长度数组
  7. 返回令牌数组
/* Location of one token inside the source string (not NUL-terminated). */
typedef struct {
    const char *start;
    size_t len;
} token;

/*
 * Split str on sep without modifying it.
 *
 * Returns a newly allocated, NULL-terminated array of newly allocated token
 * copies.  Empty tokens are kept, so "a,,b" gives { "a", "", "b", NULL }.
 * Returns NULL if str is NULL or an allocation fails.  The caller frees
 * every string and then the array.
 */
char **split(const char *str, char sep)
{
    char **array = NULL;
    token *tokens;
    size_t start = 0, stop, toks = 0, t;

    if (str == NULL)
        return NULL;

    /* Worst case: str consists only of separators -> strlen(str) + 1 tokens */
    tokens = malloc((strlen(str) + 1) * sizeof *tokens);
    if (tokens == NULL)
        return NULL;

    for (stop = 0; str[stop]; stop++) {
        if (str[stop] == sep) {
            tokens[toks].start = str + start;
            tokens[toks].len = stop - start;
            toks++;
            start = stop + 1;
        }
    }
    /* Mop up the last token */
    tokens[toks].start = str + start;
    tokens[toks].len = stop - start;
    toks++;

    array = malloc((toks + 1) * sizeof *array);
    if (array == NULL)
        goto done;

    for (t = 0; t < toks; t++) {
        char *copy = malloc(tokens[t].len + 1);
        if (copy == NULL) {
            /* Roll back so the caller never sees a half-built array */
            while (t > 0)
                free(array[--t]);
            free(array);
            array = NULL;
            goto done;
        }
        memcpy(copy, tokens[t].start, tokens[t].len);
        copy[tokens[t].len] = '\0';
        array[t] = copy;
    }
    /* Add a sentinel */
    array[toks] = NULL;

done:
    free(tokens);
    return array;
}

请注意,为简洁起见,省略了malloc检查。

一般来说,我不会从这样的拆分函数中返回一个char *指针数组,因为它让调用者承担了很多正确释放它们的责任。 我更喜欢的接口是允许调用者传递一个回调函数并为每个标记调用它,正如我在这里描述的: Split a String in C

在上面的示例中,将有一种方法可以在字符串中返回一个以空字符结尾的字符串数组(如您所愿)。 但是,它无法传递文字字符串,因为它必须由函数修改:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/*
 * Split str in place on delim.
 *
 * Returns a newly allocated array of pointers into str, followed by a NULL
 * sentinel; every delim in str is overwritten with '\0'.  The number of
 * tokens (delimiters + 1, empty tokens included) is stored in *numSplits
 * when numSplits is not NULL.
 *
 * Returns NULL and reports -1 tokens when str is NULL, delim is '\0' or
 * the array cannot be allocated.  Only the array must be freed by the
 * caller; the tokens live inside str.
 */
char** str_split( char* str, char delim, int* numSplits )
{
    char** ret = NULL;
    int retLen = -1;
    char* c;

    if ( ( str != NULL ) && ( delim != '\0' ) )
    {
        int tokens = 1;

        /* Pre-calculate number of elements; a plain loop (rather than
           do/while) keeps an empty string from being read past its end */
        for ( c = str; *c != '\0'; c++ )
        {
            if ( *c == delim )
            {
                tokens++;
            }
        }

        /* +1 so the NULL sentinel has a slot of its own */
        ret = malloc( ( (size_t)tokens + 1 ) * sizeof( *ret ) );

        if ( ret != NULL )
        {
            retLen = 0;
            ret[retLen++] = str;

            for ( c = str; *c != '\0'; c++ )
            {
                if ( *c == delim )
                {
                    *c = '\0';
                    ret[retLen++] = &c[1];
                }
            }

            ret[retLen] = NULL;
        }
    }

    if ( numSplits != NULL )
    {
        *numSplits = retLen;
    }

    return ret;
}

/* Demo: split a writable copy of a string literal and print the pieces. */
int main( int argc, char* argv[] )
{
    const char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    char* strCpy;
    char** split;
    int num;
    int i;

    (void) argc;
    (void) argv;

    /* +1 for the terminating NUL that strcpy() writes as well */
    strCpy = malloc( ( strlen( str ) + 1 ) * sizeof( *strCpy ) );
    if ( strCpy == NULL )
    {
        puts( "out of memory" );
        return 1;
    }
    strcpy( strCpy, str );

    split = str_split( strCpy, ',', &num );

    if ( split == NULL )
    {
        puts( "str_split returned NULL" );
    }
    else
    {
        printf( "%i Results: \n", num );

        for ( i = 0; i < num; i++ )
        {
            puts( split[i] );
        }
    }

    /* The tokens point into strCpy, so only these two blocks are freed */
    free( split );
    free( strCpy );

    return 0;
}

可能有一种更简洁的方法来做到这一点,但你明白了。

此函数接受一个 char* 字符串并通过分隔符将其拆分。 一行中可以有多个分隔符。 请注意,该函数会修改原始字符串。 如果您需要原始字符串保持不变,则必须先复制原始字符串。 此函数不使用任何 cstring 函数调用,因此它可能比其他函数快一点。 如果您不关心内存分配,您可以在函数顶部分配大小为 strlen(src_str)/2 的子字符串,并且(如提到的 c++“版本”)跳过函数的下半部分。 如果这样做,函数会减少到 O(N),但下面显示的内存优化方式是 O(2N)。

功能:

// Split src_str in place on deliminator, skipping empty sub strings.
//
// Every deliminator in src_str is overwritten with '\0'.  num_sub_str
// receives the number of non-empty sub strings.  The return value is a
// malloc()ed array of pointers into src_str, followed by a NULL sentinel,
// or 0 when no sub string exists or the array cannot be allocated (then
// num_sub_str is 0).  The caller frees only the array.
char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
  //replace deliminator's with zeros and count how many
  //sub strings with length >= 1 exist
  num_sub_str = 0;
  char *src_str_tmp = src_str;
  bool found_delim = true;
  while(*src_str_tmp){
    if(*src_str_tmp == deliminator){
      *src_str_tmp = 0;
      found_delim = true;
    }
    else if(found_delim){ //found first character of a new string
      num_sub_str++;
      found_delim = false;
      //sub_str_vec.push_back(src_str_tmp); //for c++
    }
    src_str_tmp++;
  }
  printf("Start - found %zu sub strings\n", num_sub_str); //%zu: size_t argument
  if(num_sub_str == 0){
    printf("str_split() - no substrings were found\n");
    return(0);
  }

  //if you want to use a c++ vector and push onto it, the rest of this function
  //can be omitted (obviously modifying input parameters to take a vector, etc)

  //one extra POINTER (not one extra byte) for the NULL sentinel
  char **sub_strings = (char **)malloc(sizeof(char*) * (num_sub_str + 1));
  if(!sub_strings){
    num_sub_str = 0;
    return(0);
  }
  const char *src_str_terminator = src_str_tmp;
  src_str_tmp = src_str;
  bool found_null = true;
  size_t idx = 0;
  while(src_str_tmp < src_str_terminator){
    if(!*src_str_tmp) //found a NULL
      found_null = true;
    else if(found_null){
      sub_strings[idx++] = src_str_tmp;
      //printf("sub_string_%d: [%s]\n", idx-1, sub_strings[idx-1]);
      found_null = false;
    }
    src_str_tmp++;
  }
  sub_strings[num_sub_str] = NULL;

  return(sub_strings);
}

如何使用它:

  // str_split() writes '\0' over the delimiters of the buffer it is given,
  // so it is run on a strdup() copy here and months stays intact.
  char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
  char *str = strdup(months);
  size_t num_sub_str;
  char **sub_strings = str_split(str, ',', num_sub_str);
  char *endptr;  // NOTE(review): not used anywhere in this snippet
  if(sub_strings){
    // the returned array ends with a NULL entry
    for(int i = 0; sub_strings[i]; i++)
      printf("[%s]\n", sub_strings[i]);
  }
  // the sub strings point into str: free the array, then the copy
  free(sub_strings);
  free(str);

这是一个字符串拆分函数,可以处理多字符分隔符。 请注意,如果分隔符比要拆分的字符串长,则 buffer 和 stringLengths 将被设置为 (void *) 0,而 numStrings 将被设置为 0。

该算法已经过测试,并且有效。 (免责声明:未针对非 ASCII 字符串进行测试,假设调用者提供了有效参数)

/*
 * Split original on every occurrence of the (possibly multi-character)
 * delimiter.
 *
 * Outputs:
 *   *buffer        - malloc()ed array of malloc()ed, NUL-terminated pieces
 *   *numStrings    - number of pieces (delimiter occurrences + 1)
 *   *stringLengths - malloc()ed array with the length of each piece
 *
 * Occurrences are matched left to right without overlap; a delimiter at the
 * very start or end yields an empty piece.  An empty delimiter never
 * matches, so the whole input comes back as a single piece.
 *
 * If the delimiter is longer than original, or an allocation fails,
 * *buffer and *stringLengths are NULL and *numStrings is 0.  The caller
 * frees every piece, the piece array and the length array.
 */
void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
    const size_t lo = strlen(original);
    const size_t ld = strlen(delimiter);
    const char *cursor;
    const char *hit;
    char **parts;
    int *lengths;
    int count = 1;

    *buffer = (void *)0;
    *numStrings = 0;
    *stringLengths = (void *)0;

    if(ld > lo){
        return;
    }

    /* First pass: count the delimiter occurrences. */
    if(ld > 0){
        for(cursor = original;(hit = strstr(cursor, delimiter)) != NULL;cursor = hit + ld){
            count++;
        }
    }

    parts = malloc(sizeof *parts * (size_t)count);
    lengths = malloc(sizeof *lengths * (size_t)count);
    if(parts == NULL || lengths == NULL){
        free(parts);
        free(lengths);
        return;
    }

    /* Second pass: copy out the text between consecutive occurrences. */
    cursor = original;
    for(int i = 0;i < count;i++){
        size_t len;

        hit = (ld > 0) ? strstr(cursor, delimiter) : NULL;
        len = hit ? (size_t)(hit - cursor) : strlen(cursor);

        parts[i] = malloc(len + 1);
        if(parts[i] == NULL){
            while(i > 0){
                free(parts[--i]);
            }
            free(parts);
            free(lengths);
            return;
        }
        memcpy(parts[i], cursor, len);
        parts[i][len] = 0;
        lengths[i] = (int)len;

        if(hit){
            cursor = hit + ld;
        }
    }

    *buffer = parts;
    *numStrings = count;
    *stringLengths = lengths;
}

示例代码:

/* Demo: split on a multi-character delimiter, print and release the pieces. */
int main(){
    const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
    char **buffer = NULL;
    int numStrings = 0;
    int * stringLengths = NULL;

    splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);

    for(int i = 0;i < numStrings;i++){
        printf("String: %s\n", buffer[i]);
        free(buffer[i]);    /* each piece is a separate allocation */
    }

    free(buffer);
    free(stringLengths);
    return 0;
}

图书馆:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

此优化方法在 *result 中创建(或更新现有)指针数组并返回 *count 中的元素数。

使用“max”表示您期望的最大字符串数(当您指定现有数组或任何其他原因时),否则将其设置为 0

要与分隔符列表进行比较,请将 delim 定义为 char* 并替换以下行:

if (str[i]==delim) {

使用以下两行:

 char *c=delim; while(*c && *c!=str[i]) c++;
 if (*c) {

享受

#include <stdlib.h>
#include <string.h>

// Split str (scanning at most len bytes) in place on delim.
//
// *result - in: NULL to have the pointer array allocated here, or an
//           existing array to fill (it must hold at least max entries);
//           out: the array that was filled
// *count  - out: number of strings stored (always at least 1)
// max     - maximum number of strings to produce, 0 for no limit; when the
//           limit is hit, the last string keeps the rest of the input
//           with its delimiters untouched
//
// Every delimiter that is consumed gets overwritten with 0; the strings
// point into str.  Returns the array, or NULL if it could not be allocated.
char **split(char *str, size_t len, char delim, char ***result, unsigned long *count, unsigned long max) {
  size_t i;
  char **out;

  // there is at least one string returned
  *count=1;

  out= *result;

  // when the result array is specified, fill it during the first pass
  if (out) {
    out[0]=str;
  }

  // scan the string for delimiter, up to specified length
  for (i=0; i<len; ++i) {

    // if max is specified, stop BEFORE storing entry number max: this keeps
    // a caller-supplied array of max entries from being overrun (max==1)
    if (max && *count>=max) {
      break;
    }

    // to compare against a list of delimiters,
    // define delim as a string and replace
    // the next line:
    //     if (str[i]==delim) {
    //
    // with the two following lines:
    //     char *c=delim; while(*c && *c!=str[i]) c++;
    //     if (*c) {
    //
    if (str[i]==delim) {

      // replace delimiter with zero
      str[i]=0;

      // when result array is specified, fill it during the first pass
      if (out) {
        out[*count]=str+i+1;
      }

      // increment count for each separator found
      ++(*count);
    }
  }

  // when result array is specified, we are done here
  if (out) {
    return out;
  }

  // else allocate memory for result
  // and fill the result array

  *result=malloc((*count)*sizeof(char*));
  if (!*result) {
    return NULL;
  }
  out=*result;

  // add first string to result
  out[0]=str;

  // if theres more strings
  for (i=1; i<*count; ++i) {

    // find next string: skip to the zero written over the delimiter
    while(*str) ++str;
    ++str;

    // add next string to result
    out[i]=str;

  }

  return out;
}

使用示例:

#include <stdio.h>

// Demo: split the same text twice, once into a caller-supplied array
// limited to 6 entries and once into an array allocated by split().
int main(int argc, char **argv) {
  const char *str="JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
  // split() writes into its input, so each call gets its own copy; the
  // copies are kept so that they can be freed once printing is done
  char *copy1=strdup(str);
  char *copy2=strdup(str);
  char **result=malloc(6*sizeof(char*));
  char **result2=0;
  unsigned long count=0;
  unsigned long count2=0;
  unsigned long i;

  (void)argc;
  (void)argv;

  if (!copy1 || !copy2 || !result) {
    free(copy1);
    free(copy2);
    free(result);
    return 1;
  }

  split(copy1,strlen(str),',',&result,&count,6);
  if (!split(copy2,strlen(str),',',&result2,&count2,0)) {
    count2=0;
  }

  if (result)
  for (i=0; i<count; ++i) {
    printf("%s\n",result[i]);
  }

  printf("\n");

  if (result2)
  for (i=0; i<count2; ++i) {
    printf("%s\n", result2[i]);
  }

  // the strings live inside the copies: free the arrays, then the copies
  free(result);
  free(result2);
  free(copy1);
  free(copy2);

  return 0;

}

下面是我的zString library中的strtok()实现。 zstring_strtok()与标准库的strtok()处理连续分隔符的方式不同。

看看下面的代码,确保你会了解它是如何工作的(我尝试使用尽可能多的注释)

/*
 * strtok()-like tokenizer that reports consecutive delimiters.
 *
 * Only the first character of delim is compared.  Pass the string on the
 * first call and NULL afterwards to continue with the remainder kept in a
 * function-local static pointer.
 *
 * Returns
 *   - NULL when delim is NULL, or when str is NULL and nothing is left;
 *   - delim itself when the text to scan starts with the delimiter;
 *   - otherwise the start of the text, cut at the first delimiter (which is
 *     overwritten with '\0'); if no delimiter is present, the whole rest is
 *     returned and the saved position is cleared.
 */
char *zstring_strtok(char *str, const char *delim) {
    static char *static_str=0;      /* where the next call resumes */
    char *scan;

    if (delim==0)
        return 0;

    if (str == 0) {
        /* continuation call: pick up the saved remainder, if any */
        str = static_str;
        if (str == 0)
            return 0;
    }

    /* walk to the first delimiter or to the end of the text */
    scan = str;
    while (*scan != '\0' && *scan != delim[0])
        scan++;

    /* no delimiter left: hand back the rest and stop the iteration */
    if (*scan == '\0') {
        static_str = 0;
        return str;
    }

    /* text starts with the delimiter: report it by returning delim */
    if (scan == str) {
        static_str = str + 1;
        return (char *)delim;
    }

    /* cut the token at the delimiter and remember what follows;
     * this writes into str, so str must be a modifiable buffer
     */
    *scan = '\0';
    static_str = scan + 1;

    return str;
}

下面是一个示例用法...

  Example Usage
      char str[] = "A,B,,,C";
      /* first call gets the array declared above; NULL continues with it */
      printf("1 %s\n",zstring_strtok(str,","));
      printf("2 %s\n",zstring_strtok(NULL,","));
      printf("3 %s\n",zstring_strtok(NULL,","));
      printf("4 %s\n",zstring_strtok(NULL,","));
      printf("5 %s\n",zstring_strtok(NULL,","));
      /* NOTE(review): this call returns NULL; printing it as "(null)"
         relies on a C library extension of %s */
      printf("6 %s\n",zstring_strtok(NULL,","));

  Example Output
      1 A
      2 B
      3 ,
      4 ,
      5 C
      6 (null)

该库可以从 Github https://github.com/fnoyanisi/zString下载

我的版本:

/*
 * Split str on delimeter, treating runs of delimiters as one separator and
 * ignoring leading/trailing ones.  str is not modified.
 *
 * *args receives a newly allocated array of newly allocated token copies
 * and the number of tokens is returned.  A string without any token (empty
 * or delimiters only) yields one empty token.  On allocation failure *args
 * is NULL and -1 is returned.  The caller frees each token, then the array.
 */
int split(char* str, const char delimeter, char*** args) {
    int cnt = 0;
    int in_token = 0;
    char* t = str;
    char** out;

    /* Count the runs of non-delimiter characters. */
    for (char* p = str; *p != 0; p++) {
        if (*p == delimeter) {
            in_token = 0;
        } else if (!in_token) {
            in_token = 1;
            cnt++;
        }
    }
    if (cnt == 0) cnt = 1;   /* nothing but delimiters: one empty token */

    out = malloc(sizeof(char*) * (size_t)cnt);
    if (out == NULL) {
        *args = NULL;
        return -1;
    }

    /* "*t != 0" keeps a '\0' delimiter from walking past the terminator. */
    while (*t != 0 && *t == delimeter) t++;

    for (int i = 0; i < cnt; i++) {
        char* ts = t;
        while (*t != delimeter && *t != 0) t++;

        size_t len = (size_t)(t - ts);
        out[i] = malloc(len + 1);
        if (out[i] == NULL) {
            while (i > 0) free(out[--i]);
            free(out);
            *args = NULL;
            return -1;
        }
        memcpy(out[i], ts, len);
        out[i][len] = 0;

        while (*t != 0 && *t == delimeter) t++;
    }

    *args = out;
    return cnt;
}

试试用这个。

/*
 * Split str at any character contained in delim (same token rules as
 * strtok(): runs of delimiters count once, empty tokens are skipped).
 *
 * str is only read.  Returns a newly allocated, NULL-terminated array of
 * newly allocated token copies (just { NULL } when there is no token), or
 * NULL if an argument is NULL or an allocation fails.  The caller frees
 * every token and then the array.
 */
char** strsplit(char* str, const char* delim){
    char** res;
    size_t used = 0;
    const char* p;

    if (str == NULL || delim == NULL)
        return NULL;

    /* Room for the sentinel, so an input without tokens is still valid. */
    res = malloc(sizeof *res);
    if (res == NULL)
        return NULL;

    p = str + strspn(str, delim);           /* skip leading delimiters */
    while (*p != '\0'){
        size_t len   = strcspn(p, delim);   /* length of this token */
        /* +2: one slot for the new token, one for the final NULL */
        char** grown = realloc(res, (used + 2) * sizeof *res);
        char*  copy  = malloc(len + 1);

        if (grown != NULL)
            res = grown;
        if (grown == NULL || copy == NULL){
            free(copy);
            while (used > 0)
                free(res[--used]);
            free(res);
            return NULL;
        }

        memcpy(copy, p, len);
        copy[len] = '\0';
        res[used++] = copy;

        p += len;
        p += strspn(p, delim);              /* skip the delimiter run */
    }

    res[used] = NULL;

    return res;
}

Explode & implode - 初始字符串保持不变,动态内存分配

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One token: start address inside the source string plus length in bytes.
   The token is NOT NUL-terminated; the source string is left untouched. */
typedef struct
{
    uintptr_t   ptr;
    int         size;
} token_t;

/*
 * Locate the tokens of str (slen bytes long) that are separated by the
 * first character of delimiter.
 *
 * On success *tokens receives a calloc()ed array describing every token
 * (empty ones included) and the number of tokens is returned; the caller
 * frees the array.  Returns -1 with *tokens set to NULL when str contains
 * no delimiter at all or the array cannot be allocated.
 */
int explode(char *str, int slen, const char *delimiter, token_t **tokens)
{
    int i, found = 0, cur = 0;
    token_t *list;

    *tokens = NULL;

    /* Only real characters are inspected, so an empty delimiter cannot
       match the terminator. */
    for(i = 0; i < slen; i++)
    {
        if(str[i] == *delimiter)
        {
            found++;
        }
    }

    if(found == 0)
    {
        return -1;
    }

    list = calloc((size_t)found + 1, sizeof *list);
    if(list == NULL)
    {
        return -1;
    }
    list[0].ptr = (uintptr_t)str;

    /* A delimiter, or the end of the string, closes the current token. */
    for(i = 0; i <= slen; i++)
    {
        if((i == slen) || (str[i] == *delimiter))
        {
            list[cur].size = (int)((uintptr_t)&(str[i]) - list[cur].ptr);
            if(i < slen)
            {
                cur++;
                list[cur].ptr = (uintptr_t)&(str[i + 1]);
            }
        }
    }

    *tokens = list;
    return (found + 1);
}

/*
 * Join size tokens into one newly allocated string, putting the first
 * character of delimiter between neighbours.
 *
 * Returns the NUL-terminated result (an empty string for size <= 0), or
 * NULL if the allocation fails.  The caller frees it.
 */
char* implode(token_t *tokens, int size, const char *delimiter)
{
    int     i;
    size_t  total = 0, pos = 0;
    char    *str;

    if(size <= 0)
    {
        return (char*)calloc(1, sizeof(char));
    }

    /* Each token is followed by one byte: a delimiter, or the final NUL. */
    for(i = 0; i < size; i++)
    {
        total += (size_t)tokens[i].size + 1;
    }

    str = (char*)malloc(total);
    if(str == NULL)
    {
        return NULL;
    }

    for(i = 0; i < size; i++)
    {
        memcpy((void*)&str[pos], (void*)tokens[i].ptr, (size_t)tokens[i].size);
        pos += (size_t)tokens[i].size;
        str[pos++] = (i + 1 < size) ? *delimiter : '\0';
    }

    return str;
}

用法:

/* Demo: explode a month list, implode it again and print every token. */
int main(int argc, char **argv)
{
    int         i, c;
    char        *exp = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* Start from NULL so the free() calls below are safe even when
       explode() fails and nothing was allocated. */
    token_t     *tokens = NULL;
    char        *imp = NULL;

    (void)argc;
    (void)argv;

    printf("%s\n", exp);

    if((c = explode(exp, (int)strlen(exp), ",", &tokens)) > 0)
    {
        imp = implode(tokens, c, ",");
        if(imp != NULL)
        {
            printf("%s\n", imp);
        }

        for(i = 0; i < c; i++)
        {
            /* tokens are not NUL-terminated: print exactly size bytes */
            printf("%.*s, %d\n", tokens[i].size, (char*)tokens[i].ptr, tokens[i].size);
        }
    }

    free((void*)tokens);
    free((void*)imp);
    return 0;
}

如果你愿意使用外部库,我强烈推荐 bstrlib,怎么推荐都不为过。 它需要一些额外的设置,但从长远来看更容易使用。

例如,拆分下面的字符串,首先使用bfromcstr()调用创建一个bstring bstring是 char 缓冲区的包装器)。 接下来,用逗号分割字符串,将结果保存在struct bstrList中,其中包含字段qty和数组entry ,它是bstring的数组。

bstrlib有许多其他函数可以对bstring进行操作

非常简单...

#include "bstrlib.h"
#include <stdio.h>
/* Demo: split a C string on ',' with bstrlib and print every entry. */
int main() {
  int i;
  char *tmp = "Hello,World,sak";
  bstring bstr = bfromcstr(tmp);
  struct bstrList *blist;

  if (bstr == NULL) {
    return 1;
  }

  blist = bsplit(bstr, ',');
  if (blist == NULL) {
    bdestroy(bstr);
    return 1;
  }

  printf("num %d\n", blist->qty);
  for(i=0;i<blist->qty;i++) {
    /* bstr2cstr() allocates a C string copy; release it with bcstrfree() */
    char *cstr = bstr2cstr(blist->entry[i], '_');
    if (cstr != NULL) {
      printf("%d: %s\n", i, cstr);
      bcstrfree(cstr);
    }
  }

  /* the list and the source bstring are owned by this function */
  bstrListDestroy(blist);
  bdestroy(bstr);
  return 0;
}

我知道的派对迟到了,但这里还有 2 个功能可以使用,并且可能会进一步调整以满足您的需求(帖子底部的源代码

另请参阅下面的实施说明,以确定哪个功能更适合您的需求。

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>  // C99

// tokenize destructively: the returned pointers refer to tokens inside the
// (modified) source string
char **str_toksarray_alloc(
    char **strp,       /* InOut: pointer to the source non-constant c-string */
    const char *delim, /* c-string containing the delimiting chars */
    size_t *ntoks,     /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
    bool keepnulls     /* false ignores empty tokens, true includes them */
    );

// tokenize non-destructively: the returned pointers refer to dynamically
// allocated copies of the tokens
char **str_toksarray_alloc2(
   const char *str,    /* the source c-string */
   const char *delim,  /* c-string containing the delimiting chars */
   size_t *ntoks,      /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
   bool keepnulls      /* false ignores empty tokens, true includes them */
   );

使用说明

它们的原型几乎相同,只是源字符串参数不同(分别为 strp 和 str)。

strp (指向字符串的指针)是已分配的非常量 c 字符串的地址,要就地标记化。 str是一个未更改的 c 字符串(它甚至可以是字符串文字)。 c-string我的意思是一个以nul结尾的字符缓冲区。 两个函数的其余参数相同。

要解析所有可用的令牌, ntoks静音(意味着在将其传递给任何函数之前将其设置为 0 或将其作为NULL指针传递)。 否则,函数会解析到*ntoks标记,或者直到没有更多标记(以先到者为准)。 在任何情况下,当ntoks non-NULL时,它会更新成功解析令牌的计数。

另请注意,非静音ntoks确定将分配多少指针。 因此,如果源字符串包含 10 个标记并且我们将ntoks设置为 1000,我们最终将得到 990 个不必要的分配指针。 另一方面,如果源字符串包含 1000 个标记,但我们只需要前 10 个, ntoks设置为 10 听起来是一个更明智的选择。

这两个函数都分配并返回一个 char-pointers 数组,但是str_toksarray_alloc()使它们指向修改后的源字符串本身中的标记,而str_toksarray_alloc2()使它们指向动态分配的标记副本(最后是 2其名称表示 2 级分配)。

返回的数组附加了一个NULL哨兵指针,在ntoks的回传值中不考虑该指针(否则,当non-NULL时, ntoks将返回数组的长度而不是其1 级大小)。

当 keepnulls 设置为 true 时,生成的标记与 strsep() 函数的行为一致。 这主要意味着:源字符串中连续的定界符会产生空标记;如果 delim 是一个空的 c 字符串,或者在源字符串中找不到它包含的任何定界符,则结果只有 1 个标记,即源字符串本身。 与 strsep() 不同的是,可以通过将 keepnulls 设置为 false 来忽略空标记。

可以通过检查函数的返回值与NULL或通过检查ntoks的回传值与 0 (假设ntoks non-NULL )来识别函数的失败调用。 我建议在尝试访问返回的数组之前始终检查失败,因为这些函数包括健全性检查,可以推迟否则立即崩溃(例如,将NULL指针作为源字符串传递)。

成功后,调用者应该在完成后释放数组。 对于str_toksarray_alloc() ,一个简单的free()就足够了。 对于str_toksarray_alloc2() ,由于第二级分配,涉及一个循环。 NULL哨兵( non-NULL ntoks的回传值)使这变得微不足道,但我还在下面为所有懒惰的蜜蜂提供了一个toksarray_free2()函数:)

下面是使用这两个函数的简化示例。

准备:

// shared setup for the two examples below
const char *src = ";b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to";
const char *delim = ";,";   // a token ends at either ';' or ','
bool keepnulls = true;      // keep empty tokens
size_t ntoks = 0;           // 0 = parse all tokens

str_toksarray_alloc():

// destructive (use copy of src)

char *scopy = strdup( src );
if (!scopy) { ... };          // handle strdup failure

printf( "%s\n", src );
char **arrtoks = str_toksarray_alloc( &scopy, delim, &ntoks, keepnulls );
printf( "%zu tokens read\n", ntoks );   // ntoks is a size_t: %zu, not %lu
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
// the tokens point into scopy, so one free() per buffer is enough
free( scopy );
free( arrtoks );

/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
11 tokens read
0:
1: b
2: test
3: Tèst
4:
5:
6: cd
7: ελληνικά
8: nørmälize
9:
10: string to
 */

str_toksarray_alloc2():

// non-destructive

keepnulls = false;    // reject empty tokens

printf( "%s\n", src );
arrtoks = str_toksarray_alloc2( src, delim, &ntoks, keepnulls );
printf( "%zu tokens read\n", ntoks );   // ntoks is a size_t: %zu, not %lu
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
toksarray_free2( arrtoks );                     // dangling arrtoks
// or: arrtoks = toksarray_free2( arrtoks );    // non-dangling artoks

/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
7 tokens read
0: b
1: test
2: Tèst
3: cd
4: ελληνικά
5: nørmälize
6: string to
*/

实施说明

这两个函数都使用 strsep() 进行标记化,这使它们成为线程安全的,但 strsep() 不是标准函数。 如果您的平台未提供它,可以使用开源实现(例如 GNU 的或 Apple 的)。 str_toksarray_alloc2() 中使用的 strdup() 函数也是如此(它的实现很简单,但同样可以参考 GNU 和 Apple 的实现)。

在 str_toksarray_alloc() 中使用 strsep() 的一个副作用是:源字符串的起始指针在解析循环的每一步中都会不断移动到下一个标记。 这意味着调用者将无法释放已解析的字符串,除非他们已将起始地址保存到额外的指针。 我们通过使用 strpSaved 指针在函数中本地执行此操作,为他们省去了麻烦。 str_toksarray_alloc2() 不受此影响,因为它不接触源字符串。

这两个函数之间的主要区别是str_toksarray_alloc()不为找到的令牌分配内存。 它只是为数组指针分配空间,并将它们设置为直接指向源字符串。 这是有效的,因为strsep() nul -就地终止找到的标记。 这种依赖性会使您的支持代码复杂化,但对于大字符串,它也会对性能产生很大影响。 如果保留源字符串并不重要,那么它也会对内存占用产生很大影响。

另一方面, str_toksarray_alloc2()分配并返回一个由动态分配的令牌副本组成的自我维持数组,没有进一步的依赖关系。 它首先通过从源字符串的本地副本创建数组,然后将实际令牌内容复制到数组中来实现。 str_toksarray_alloc()相比,这要慢得多并且留下更大的内存占用,但它没有进一步的依赖关系,并且对源字符串的性质没有特殊要求。 这使得编写更简单(因此更易于维护)的支持代码变得更加容易。

这两个函数之间的另一个区别是当 ntoks 静音时的第一级分配(数组指针)。 它们都解析所有可用的令牌,但采用完全不同的方法。 str_toksarray_alloc() 使用初始大小为 16(个字符指针)的预先分配(alloc-ahead),在解析循环中按需加倍。 str_toksarray_alloc2() 先进行第一遍扫描,统计所有可用令牌,然后只分配一次那么多字符指针。 第一遍扫描由辅助函数 str_tokscount() 完成,它使用标准函数 strpbrk() 和 strchr()。 我也在下面提供该函数的源代码。

哪种方法更好由您决定,具体取决于您的项目需求。 随意将每个函数的代码调整为任一方法并从那里获取。

我想说的是,平均而言,对于非常大的字符串,alloc-ahead 的速度要快得多,尤其是当初始大小和增长因子根据每个案例进行微调时(例如,使它们成为函数参数)。 用所有那些strchr()strpbrk()保存额外的通行证可以在那里有所作为。 然而,对于相对较小的字符串,这几乎是常态,提前分配一堆字符指针只是一种矫枉过正。 这并没有什么坏处,但在这种情况下它确实会无缘无故地弄乱代码。 无论如何,请随意选择最适合您的。

这两个功能也是如此。 我想说在大多数情况下str_toksarray_alloc2()处理起来要简单得多,因为内存和性能很少是中小型字符串的问题。 如果您必须处理巨大的字符串,请考虑使用str_toksarray_alloc() (尽管在这些情况下,您应该使用专门的字符串解析函数,接近您的项目需求和输入规范)。

哦,男孩,我认为这不仅仅是 2 美分(笑)。

无论如何,这里是 2 个函数和辅助函数的代码(我已经删除了他们的大部分描述注释,因为我已经涵盖了几乎所有内容)。

源代码

str_toksarray_alloc():

// ----------------------------------------
// Destructive tokenizer for a nul-terminated source-string.
//
// Parses *strp with strsep() and returns a malloc'ed array of char-pointers
// that point at the tokens inside the source-string, closed by a NULL
// sentinel. *strp is set back to its initial value before returning.
//
// When ntoks is non-NULL and *ntoks is non-zero, at most *ntoks tokens are
// stored; otherwise the array starts with room for 16 tokens and doubles on
// demand. With keepnulls == false empty tokens are skipped. A non-NULL ntoks
// receives the count of stored tokens (0 on failure).
//
// Returns NULL on invalid/empty input or when memory runs out.
//
char **str_toksarray_alloc(char **strp, const char *delim, size_t *ntoks, bool keepnulls)
{
    // reject NULL arguments and an empty source-string
    if ( strp == NULL || *strp == NULL || **strp == '\0' || delim == NULL ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    char *origin = *strp;                               // strsep() will move *strp
    bool limited = (ntoks != NULL && *ntoks != 0);      // caller fixed the token count
    size_t capacity = limited ? *ntoks : 16;            // token slots currently available

    // token slots plus one for the NULL sentinel
    char **arr = malloc( (capacity+1) * sizeof(*arr) );
    if ( arr == NULL ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    size_t used = 0;        // tokens stored so far
    for (;;) {
        char *tok = strsep( strp, delim );
        if ( tok == NULL ) {
            break;          // source-string exhausted
        }
        if ( !keepnulls && tok[0] == '\0' ) {
            continue;       // empty token not wanted
        }
        if ( used == capacity ) {
            if ( limited ) {
                break;      // requested number of tokens reached
            }
            // unlimited mode: double the array and carry on
            capacity *= 2;
            char **bigger = realloc( arr, (capacity+1) * sizeof(*bigger) );
            if ( bigger == NULL ) {
                *strp = origin;
                free( arr );
                if (ntoks) *ntoks = 0;
                return NULL;
            }
            arr = bigger;
        }
        arr[used++] = tok;
    }
    arr[used] = NULL;       // sentinel

    *strp = origin;         // hand the start of the string back to the caller
    if (ntoks) *ntoks = used;
    return arr;
}

str_toksarray_alloc2():

// ----------------------------------------
// Tokenize non-destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of dynamically
// allocated and nul-terminated string copies of each token found in the
// source-string. Return NULL on error (NULL/empty input, no token to
// return, or out of memory).
//
// When ntoks is non-NULL and *ntoks is non-zero, at most *ntoks tokens are
// parsed; otherwise all of them. With keepnulls == false empty tokens are
// skipped. A non-NULL ntoks receives the count of returned tokens (0 on
// error).
// The 2 at the end of the name means 2-levels of allocation.
//
char **str_toksarray_alloc2( const char *str, const char *delim, size_t *ntoks, bool keepnulls )
{
    char *copy = NULL;      // private, writable copy of str
    char *cursor;           // parse position inside copy
    char **toksarr = NULL;
    char *tok;
    size_t cap;
    size_t i = 0;           // # of actually parsed tokens

    // sanity checks
    if ( !str || !*str || !delim ) {
        goto failed;
    }

    // make a copy of str to work with
    copy = strdup( str );
    if ( !copy ) {
        goto failed;
    }

    // if ntoks is muted we'll allocate str_tokscount() tokens, else *ntoks
    cap = (ntoks && *ntoks) ? *ntoks : str_tokscount(copy, delim, keepnulls);
    if ( cap == 0 ) {       // str_tokscount() failed
        goto failed;
    }

    // alloc the array of strings (+1 for an extra NULL sentinel)
    toksarr = malloc( (cap+1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto failed;
    }

    // Parse the tokens and duplicate them into the array.
    // strsep() moves the pointer it is given (and finally sets it to NULL),
    // so it works on cursor while copy keeps the address free() needs.
    cursor = copy;
    while ( i < cap && (tok = strsep(&cursor, delim)) ) {
        // if requested, skip empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // duplicate current token into the array
        char *tmptok = strdup( tok );
        if ( !tmptok ) {
            goto failed;
        }
        toksarr[i++] = tmptok;
    }
    toksarr[i] = NULL;      // NULL sentinel

    free( copy );           // release the local copy of the source-string
    if (ntoks) *ntoks = i;  // pass to caller the # of parsed tokens
    return toksarr;

// cleanup before failing
failed:
    if ( toksarr ) {
        while ( i > 0 ) {
            free( toksarr[--i] );
        }
        free( toksarr );
    }
    free( copy );
    if (ntoks) *ntoks = 0;
    return NULL;
}

str_tokscount() - 辅助函数,由str_toksarray_alloc2()使用:

// ----------------------------------------
// Count the tokens of a nul-terminated source-string (str), where any char
// of the nul-terminated string delim acts as a delimiter.
// When keepnulls is false, empty tokens are not counted.
//
// Mirroring strsep(), a source-string that contains no delimiter (or an empty
// delim) counts as exactly one token.
// The result is 0 when str or delim is NULL, or when str is empty.
//
size_t str_tokscount( const char *str, const char *delim, bool keepnulls )
{
    // reject NULL arguments and the empty source-string
    if ( str == NULL || delim == NULL || *str == '\0' ) {
        return 0;
    }

    size_t total = 1;       // the text before the first delimiter is a token
    // a leading delimiter means the first token is empty
    size_t empties = (strchr(delim, *str) != NULL) ? 1 : 0;

    const char *hit = strpbrk( str, delim );
    while ( hit != NULL ) {
        const char *next = hit + 1;     // first char of the following token
        total++;
        // next token is empty when it starts on a delimiter or on the
        // terminating nul (strchr() also matches the terminator of delim)
        if ( strchr(delim, *next) != NULL ) {
            empties++;
        }
        hit = strpbrk( next, delim );
    }

    if ( keepnulls ) {
        return total;
    }
    return total - empties;
}

toksarray_free2() - 在str_toksarray_alloc2()返回的数组上使用它:

// ----------------------------------------
// Release a dynamically allocated array of char-pointers that ends with a
// NULL sentinel, together with the dynamically allocated data each pointer
// refers to (hence the 2 in the name: 2-levels of deallocation).
//
// Layout expected: toksarr[0] = tok1, ..., toksarr[len] = NULL
//
// Always returns NULL so that the caller may write
//      arr = toksarray_free2( arr );
// and avoid keeping a dangling pointer. A NULL argument is accepted.
//
char **toksarray_free2( char **toksarr )
{
    if ( toksarr == NULL ) {
        return NULL;
    }

    // release every string up to (not including) the NULL sentinel
    for ( size_t n = 0; toksarr[n] != NULL; n++ ) {
        free( toksarr[n] );
    }
    free( toksarr );    // then the array itself

    return NULL;
}

strtok() 和 strsep() 都会修改输入字符串。 我们可以使用 strspn() 和 strpbrk() 编写一个函数,根据分隔符拆分字符串。

算法:

  1. 如果输入字符串不为空,则转到步骤 2,否则返回null
  2. 跳过分隔符,如果在字符串的开头有的话,并记录单词的开始位置(为此使用strspn() ),称之为start
  3. 从上一步找到的起点开始,查找下一个分隔符的位置(为此使用strpbrk() );如果不存在更多分隔符,则取字符串结尾。将其称为end 。
  4. 分配内存,并将从start到end的字符串复制到该内存中。
  5. 返回令牌。

优势:

  1. 线程安全。
  2. 处理多个分隔符。
  3. 便携的。
  4. 不会像strtok()和strsep()那样修改输入字符串。

执行:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * alloc_str duplicates the characters in the half-open range [start, end)
 * into freshly allocated, null-terminated memory.
 *
 * Returns NULL when either pointer is NULL or the range is empty/reversed.
 * On allocation failure a message is written to stderr and the program
 * exits with EXIT_FAILURE. The caller frees the returned memory.
 */

static char * alloc_str (const char * start, const char * end) {
    if (start == NULL || end == NULL || end <= start) {
        return NULL;
    }

    const size_t span = (size_t) (end - start);

    char * copy = malloc (span + 1);
    if (copy == NULL) {
        fprintf (stderr, "Failed to allocate memory\n");
        exit (EXIT_FAILURE);
    }

    memcpy (copy, start, span);
    copy[span] = '\0';

    return copy;
}

/*
 * str_split extracts the next token from *p_str. A token is a run of
 * contiguous characters that contains none of the characters in sep.
 *
 * Parameters:
 * p_str : Address of the pointer to the string being split. On return the
 *         pointer is advanced to the first character after the token.
 * sep   : The set of characters that delimit tokens.
 *
 * Behaviour is undefined if sep is not a pointer to a null-terminated string.
 *
 * Return :
 * A pointer to dynamically allocated memory holding a copy of the token
 * (to be freed by the caller), or NULL when *p_str is NULL, points to an
 * empty string, or holds nothing but separators.
 */

char * str_split (char ** p_str, const char * sep) {
    if (*p_str == NULL || **p_str == '\0') {
        return NULL;
    }

    // step over any leading separators to reach the start of the token
    char * begin = *p_str + strspn (*p_str, sep);

    // the token runs up to the next separator, or to the end of the
    // string when no further separator exists
    char * stop = begin + strcspn (begin, sep);

    // remember where the next call has to resume
    *p_str = stop;

    return alloc_str (begin, stop);
}

/*==================================================*/
/*==================================================*/

/*
 * Just a helper function
 */

void token_helper (char * in_str, const char * delim) {
    printf ("\nInput string : ");

    if (in_str) printf ("\"%s\"\n", in_str);
    else printf ("NULL\n");

    if (delim) printf ("Delimiter : \"%s\"\n", delim);

    char * ptr = in_str;
    char * token = NULL;

    printf ("Tokens:\n");
    while ((token = str_split(&ptr, delim)) != NULL) {
        printf ("-> %s\n", token);
        /* You can assign this token to a pointer of an array of pointers
         * and return that array of pointers from this function.
         * Since, this is for demonstration purpose, I am 
         * freeing the allocated memory now.
         */
        free (token);
    }
}

/*
 * Driver function: runs token_helper() over a fixed list of test cases.
 */

int main (void) {
    /* test cases; a NULL text means "pass a NULL input string" */
    static const struct {
        const char * text;
        const char * delim;
    } cases[] = {
        { "hello world!", " " },
        { " hello world,friend of mine!", " ," },
        { "Another string", "-!" },
        { "   one  more   -- string  !", "- !" },
        { "", " " },
        { NULL, "" },
        { "hi", " -$" },
        { "Give papa a cup of proper coffee in a copper coffee cup.", "cp" },
        { "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC", "," },
    };

    char string[100];

    for (size_t k = 0; k < sizeof cases / sizeof cases[0]; k++) {
        if (cases[k].text == NULL) {
            token_helper (NULL, cases[k].delim);
            continue;
        }
        /* each case is copied into the writable buffer before use */
        strcpy (string, cases[k].text);
        token_helper (string, cases[k].delim);
    }

    return 0;
}

输出:

# ./a.out

Input string : "hello world!"
Delimiter : " "
Tokens:
-> hello
-> world!

Input string : " hello world,friend of mine!"
Delimiter : " ,"
Tokens:
-> hello
-> world
-> friend
-> of
-> mine!

Input string : "Another string"
Delimiter : "-!"
Tokens:
-> Another string

Input string : "   one  more   -- string  !"
Delimiter : "- !"
Tokens:
-> one
-> more
-> string

Input string : ""
Delimiter : " "
Tokens:

Input string : NULL
Delimiter : ""
Tokens:

Input string : "hi"
Delimiter : " -$"
Tokens:
-> hi

Input string : "Give papa a cup of proper coffee in a copper coffee cup."
Delimiter : "cp"
Tokens:
-> Give 
-> a
-> a a 
-> u
->  of 
-> ro
-> er 
-> offee in a 
-> o
-> er 
-> offee 
-> u
-> .

Input string : "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
Delimiter : ","
Tokens:
-> JAN
-> FEB
-> MAR
-> APR
-> MAY
-> JUN
-> JUL
-> AUG
-> SEP
-> OCT
-> NOV
-> DEC

我的方法是扫描字符串并让指针指向分隔符(和第一个字符)之后的每个字符,同时将字符串中分隔符的外观分配给'\0'。
首先制作原始字符串的副本(因为它是常量),然后通过扫描获取拆分数,将其传递给指针参数len 之后,将第一个结果指针指向复制字符串指针,然后扫描复制字符串:一旦遇到分隔符,将其分配给 '\0' 从而终止前一个结果字符串,并将下一个结果字符串指针指向下一个字符指针。

/*
 * Split a_str at every occurrence of a_delim without modifying a_str.
 *
 * A private copy of a_str is made; each delimiter in the copy is replaced
 * by '\0' and the returned array points at the start of every piece
 * inside that copy.
 *
 * a_str   : null-terminated string to split.
 * a_delim : delimiter character.
 * len     : receives the number of delimiters found in a_str. The returned
 *           array therefore holds *len + 1 strings. Set to 0 on failure.
 *
 * Returns the array of pieces, or NULL if memory could not be allocated.
 * To release the result, free results[0] (the copied text) and then the
 * array itself.
 */
char** split(char* a_str, const char a_delim, int* len){
    size_t length = strlen(a_str);

    /* +1: the copy needs room for the terminating '\0' as well */
    char* s = malloc(length + 1);
    if (s == NULL){
        *len = 0;
        return NULL;
    }
    memcpy(s, a_str, length + 1);

    int count = 0;
    for (const char* p = a_str; *p != '\0'; p++){
        if (*p == a_delim) count += 1;
    }
    *len = count;

    /* count delimiters separate count + 1 pieces */
    char** results = malloc(((size_t)count + 1) * sizeof *results);
    if (results == NULL){
        free(s);
        *len = 0;
        return NULL;
    }

    results[0] = s;
    int i = 1;
    for (char* p = s; *p != '\0'; p++){
        if (*p == a_delim){
            *p = '\0';              /* terminate the previous piece */
            results[i++] = p + 1;   /* next piece starts right after it */
        }
    }
    return results;
}

我的代码(经过测试):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Split str in place into tokens separated by any character of delim.
 *
 * str    : writable, null-terminated string; strtok() overwrites the
 *          delimiters in it with '\0'.
 * delim  : set of delimiter characters.
 * array  : receives a malloc'd array of pointers into str (one per token).
 *          The caller frees the array only, not the tokens. It is NULL when
 *          no token was found or on failure.
 * length : receives the number of tokens (0 on failure).
 *
 * Returns 1 on success, 0 if memory could not be allocated.
 */
int dtmsplit(char *str, const char *delim, char ***array, int *length ) {
  char **res = NULL;
  int i = 0;

  for (char *token = strtok(str, delim); token != NULL; token = strtok(NULL, delim)) {
    /* grow through a temporary so the old block is not lost on failure */
    char **grown = realloc(res, ((size_t)i + 1) * sizeof *res);
    if (grown == NULL) {
      free(res);
      *array = NULL;
      *length = 0;
      return 0;
    }
    res = grown;
    res[i++] = token;
  }

  *array = res;
  *length = i;
  return 1;
}

/*
 * Demo: split the month list with dtmsplit() and print every token.
 */
int main()
{
    char str[80] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **arr = NULL;
    int count = 0;

    if (!dtmsplit(str, ",", &arr, &count)) {
        fprintf(stderr, "dtmsplit failed\n");
        return 1;
    }
    printf("Found %d tokens.\n", count);

    for (int i = 0; i < count; i++)
        printf("string #%d: %s\n", i, arr[i]);

    /* the tokens point into str; only the pointer array was allocated */
    free(arr);

    return(0);
}

结果:

Found 12 tokens.
string #0: JAN
string #1: FEB
string #2: MAR
string #3: APR
string #4: MAY
string #5: JUN
string #6: JUL
string #7: AUG
string #8: SEP
string #9: OCT
string #10: NOV
string #11: DEC

另一个答案(从这里移到这里 ):

尝试使用strtok函数:

请参阅此处和此处,查看有关此主题的详细信息。

这里的问题是您必须立即处理words 如果要将其存储在数组中,则必须为其分配correct size ,否则它是未知的。

因此,例如:

/*
 * Split in_text into tokens separated by any character of in_sep.
 * Runs of consecutive separators count as one separator, so no empty
 * tokens are produced. in_text is not modified.
 *
 * Returns a NULL terminated array of newly allocated strings (release it
 * by freeing every string and then the array), or NULL when an argument
 * is NULL or memory could not be allocated.
 */
char **Split(char *in_text, char *in_sep)
{
    if (in_text == NULL || in_sep == NULL)
        return NULL;

    // This is the pass ONE: we count
    size_t count = 0;
    for (const char *p = in_text + strspn(in_text, in_sep); *p != '\0'; )
    {
        count++;
        p += strcspn(p, in_sep);    // skip the token itself
        p += strspn(p, in_sep);     // skip the separators after it
    }

    // We create a NULL terminated array hence the +1
    char **ret = calloc(count + 1, sizeof *ret);
    if (ret == NULL)
        return NULL;

    // This is the pass TWO: we store
    size_t idx = 0;
    const char *pos = in_text + strspn(in_text, in_sep);
    while (*pos != '\0')
    {
        size_t len = strcspn(pos, in_sep);
        char *copy = malloc(len + 1);
        if (copy == NULL)
        {
            // undo the partial result before reporting failure
            while (idx > 0)
                free(ret[--idx]);
            free(ret);
            return NULL;
        }
        memcpy(copy, pos, len);
        copy[len] = '\0';
        ret[idx++] = copy;

        pos += len;
        pos += strspn(pos, in_sep);
    }

    return ret;
}

// Use this to free a NULL terminated array of allocated strings:
// every string is released first, then the array itself.
// Passing NULL is allowed and does nothing.
void Free_Array(char** in_array)
{
    if (in_array == NULL)
        return;

    // walk the pointers (not the characters) up to the NULL sentinel
    for (char **pos = in_array; *pos != NULL; pos++)
    {
        free(*pos);
    }

    free(in_array);
}

注意 :为了避免分配问题,我们使用相同的循环和函数来计算计数(通过一遍)并制作副本(通过第二遍)。

注意2 :您也可以使用strtok的其他实现,原因请参见单独的帖子。

您可以这样使用:

/* Usage example: split a sentence, then release the result. */
int main(void)
{
  char **words;

  words = Split("Hello World!", " ");
  /* words now holds the tokens */
  /* ... */

  /* release the strings and the array once done */
  Free_Array(words);
  return 0;
}

(我没有对其进行测试,所以如果不起作用请通知我!)

围绕这个问题的两个问题是内存管理和线程安全。 正如您从众多帖子中看到的那样,在 C 中无缝完成这不是一项容易的任务。我想要一个解决方案:

  • 线程安全。 (strtok 不是线程安全的)
  • 不使用 malloc 或其任何衍生物(以避免内存管理问题)
  • 检查单个字段的数组边界(以避免未知数据的段错误)
  • 适用于多字节字段分隔符 (utf-8)
  • 忽略输入中的额外字段
  • 为无效字段长度提供软错误程序

我提出的解决方案符合所有这些标准。 与此处发布的其他一些解决方案相比,设置可能需要更多的工作,但我认为在实践中,为了避免其他解决方案的常见陷阱,额外的工作是值得的。

#include <stdio.h>
#include <string.h>

/* Describes one destination buffer for strsplit(). */
struct splitFieldType {
    char *field;        /* caller-owned buffer that receives the field text */
    int   maxLength;    /* size of that buffer in bytes, terminator included */
};

typedef struct splitFieldType splitField;

/*
 * Split input on the (possibly multi-byte) fieldSeparator and copy the
 * pieces into the caller-supplied buffers described by fields[].
 * No memory is allocated and no static state is used.
 *
 * fields         : array of at least `expected` buffer descriptors.
 * expected       : maximum number of fields to store; extra input fields
 *                  are ignored.
 * input          : null-terminated text to split.
 * fieldSeparator : null-terminated separator string.
 * softError      : called as softError(fieldNumber, room, actual) when a
 *                  field holds `actual` bytes but its buffer has room for
 *                  only `room`; the field is then truncated to `room`
 *                  bytes. May be NULL to truncate silently.
 *
 * Every stored field is null-terminated. Returns the number of fields
 * processed, which is less than `expected` when the input runs out first.
 */
int strsplit(splitField *fields, int expected, const char *input, const char *fieldSeparator, void (*softError)(int fieldNumber,int expected,int actual))  {
    const size_t separatorLen = strlen(fieldSeparator);
    const char *cursor = input;
    int i;

    for (i = 0; i < expected; ++i) {
        /* the field ends at the next separator, or at the end of input */
        const char *next = strstr(cursor, fieldSeparator);
        size_t len = (next != NULL) ? (size_t)(next - cursor) : strlen(cursor);

        /* one byte of every buffer is reserved for the terminator */
        const int room = fields[i].maxLength - 1;

        if (room < 0 || len > (size_t)room) {
            if (softError != NULL) {
                softError(i, room < 0 ? 0 : room, (int)len);
            }
            len = (room < 0) ? 0 : (size_t)room;
        }
        if (room >= 0) {    /* a zero-sized buffer cannot even hold the '\0' */
            memcpy(fields[i].field, cursor, len);
            fields[i].field[len] = '\0';
        }

        if (next == NULL) {
            return i + 1;   /* that was the last field of the input */
        }
        cursor = next + separatorLen;
    }
    return i;
}


/* Soft-error callback for the month example: reports an over-long field
 * on stderr, numbering fields from 1. */
void monthSplitSoftError(int fieldNumber, int expected, int actual) {
    const int humanIndex = fieldNumber + 1;

    fprintf(stderr,
            "monthSplit: input field #%d is %d bytes, expected %d bytes\n",
            humanIndex, actual, expected);
}


/*
 * Demo: split a month list into twelve fixed-size buffers. "APRIL" is
 * deliberately one field that does not fit, to show the soft-error path,
 * and the two trailing fields show that extra input is ignored.
 */
int main() {
  const char *fieldSeparator=",";
  const char *input="JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR";

  /* destination storage: 3 characters plus terminator per month */
  struct monthFieldsType {
    char field1[4];
    char field2[4];
    char field3[4];
    char field4[4];
    char field5[4];
    char field6[4];
    char field7[4];
    char field8[4];
    char field9[4];
    char field10[4];
    char field11[4];
    char field12[4];
  } monthFields;

  /* one descriptor per destination buffer */
  splitField inputFields[12] = {
    {monthFields.field1,  sizeof(monthFields.field1)},
    {monthFields.field2,  sizeof(monthFields.field2)},
    {monthFields.field3,  sizeof(monthFields.field3)},
    {monthFields.field4,  sizeof(monthFields.field4)},
    {monthFields.field5,  sizeof(monthFields.field5)},
    {monthFields.field6,  sizeof(monthFields.field6)},
    {monthFields.field7,  sizeof(monthFields.field7)},
    {monthFields.field8,  sizeof(monthFields.field8)},
    {monthFields.field9,  sizeof(monthFields.field9)},
    {monthFields.field10, sizeof(monthFields.field10)},
    {monthFields.field11, sizeof(monthFields.field11)},
    {monthFields.field12, sizeof(monthFields.field12)}
  };

  int expected=sizeof(inputFields)/sizeof(splitField);

  printf("input data: %s\n", input);
  printf("expecting %d fields\n",expected);

  int ct=strsplit(inputFields, expected, input, fieldSeparator, monthSplitSoftError);

  if (ct!=expected) {
    printf("string split %d fields, expected %d\n", ct,expected);
  }

  for (int i=0;i<expected;++i) {
    printf("field %d: %s\n",i+1,inputFields[i].field);
  }

  printf("\n");
  printf("Direct structure access, field 10: %s", monthFields.field10);
  return 0;
}

下面是一个示例编译和输出。 请注意,在我的示例中,我特意拼出了“APRIL”,以便您可以看到软错误是如何工作的。

$ gcc strsplitExample.c && ./a.out
input data: JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR
expecting 12 fields
monthSplit: input field #4 is 5 bytes, expected 3 bytes
field 1: JAN
field 2: FEB
field 3: MAR
field 4: APR
field 5: MAY
field 6: JUN
field 7: JUL
field 8: AUG
field 9: SEP
field 10: OCT
field 11: NOV
field 12: DEC

Direct structure access, field 10: OCT

享受!

这是另一个实现,它可以安全地对字符串文字进行标记化,并与问题中请求的原型相匹配,返回一个已分配的指向 char 指针的指针(即char ** )。 分隔符字符串可以包含多个字符,输入字符串可以包含任意数量的标记。 所有分配和重新分配都由malloc或realloc处理,不使用 POSIX 的strdup 。

分配的初始指针数由NPTRS常量控制,唯一的限制是它必须大于零。 返回的char **在最后一个标记之后包含一个哨兵NULL (类似于*argv[] ),其形式可直接用于execv 、 execvp和execve 。

与strtok()一样,多个连续分隔符被视为单个分隔符,因此"JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC"将被解析为好像只有一个','分隔"MAY,JUN" 。

下面的函数是在线注释的,并且添加了一个简短的main()来分割月份。 分配的初始指针数设置为2 ,以在对输入字符串进行标记期间强制进行三个重新分配:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NPTRS 2     /* initial number of pointers to allocate (must be > 0) */

/* Tokenize src on any character of delim and return the tokens as an
 * allocated array of allocated strings, closed by a sentinel NULL.
 * Runs of delimiters are treated as one, so empty tokens never appear;
 * src itself is left untouched.
 *
 * The pointer array starts with NPTRS slots and doubles whenever the last
 * free slot would be needed for the sentinel. If growing the array or
 * allocating a token fails, the error is reported with perror() and the
 * tokens collected so far are returned. NULL is returned only when the
 * initial block of pointers cannot be allocated.
 */
char **strsplit (const char *src, const char *delim)
{
    int used = 0, capacity = NPTRS;         /* filled slots, allocated slots */
    char **list = malloc (capacity * sizeof *list);

    if (list == NULL) {
        perror ("malloc-dest");
        return NULL;
    }
    list[0] = NULL;     /* an empty result is just the sentinel */

    const char *cursor = src;
    while (1) {
        cursor += strspn (cursor, delim);   /* step over delimiters */
        if (*cursor == '\0')                /* nothing left to tokenize */
            break;

        size_t len = strcspn (cursor, delim);   /* length of this token */

        if (used == capacity - 1) {         /* only the sentinel slot left */
            void *grown = realloc (list, 2 * capacity * sizeof *list);
            if (grown == NULL) {
                perror ("realloc-dest");
                break;  /* list is still valid and NULL terminated */
            }
            list = grown;
            capacity *= 2;
        }

        char *copy = malloc (len + 1);
        if (copy == NULL) {
            perror ("malloc-dest[i]");
            break;      /* sentinel at list[used] is still in place */
        }
        memcpy (copy, cursor, len);
        copy[len] = 0;

        list[used++] = copy;
        list[used] = NULL;                  /* keep the array terminated */

        cursor += len;  /* continue right after the token */
    }

    return list;
}

/* Demo: tokenize the month list and print one token per line. */
int main (void) {

    char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **tokens = strsplit (str, ",");    /* split string into tokens */

    if (tokens == NULL)
        return 0;

    for (size_t k = 0; tokens[k] != NULL; k++) {
        puts (tokens[k]);
        free (tokens[k]);   /* each token is its own allocation */
    }
    free (tokens);          /* then the pointer array */

    return 0;
}

示例使用/输出

$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC

如果您还有其他问题,请告诉我。

#include <cstring>
#include <cstdio>
// Prints every space-separated word of buf on its own line.
int main()
{
    char buf[] = "This is Luke Skywalker    here!";

    // strtok() writes into buf, which is why a writable array is used
    char* tok = strtok( buf, " ");
    while( tok != nullptr) {
        puts( tok);
        tok = strtok( nullptr, " ");
    }
}

输出

This
is
Luke
Skywalker
here!

我试着做一个非常简单的。 我还在 main() 中展示了示例。

#include <stdio.h>
#include <string.h>

/* Tokenize inputArr in place on any character of delim and store a
 * pointer to each token in consecutive slots of outputArr.
 * inputArr is modified by strtok(); outputArr must be large enough for
 * every token, and no terminator is written after the last one. */
void split(char* inputArr, char** outputArr, char* delim) {

        int slot = 0;
        char *piece = strtok(inputArr, delim);

        while (piece != NULL) {
            outputArr[slot++] = piece;
            piece = strtok(NULL, delim);
        }
}

/*
 * Reads the text file named by the single command-line argument and, for
 * every line, prints its first four space-separated tokens.
 * Returns 1 if the file cannot be opened, 0 otherwise.
 */
int main(int argc, char **argv){

    enum { BUFFER_LEN = 100, MAX_TOKENS = 100, SHOWN_TOKENS = 4 };

    /* check for proper arguments */

    if(argc != 2){
        printf("One Argument Expected\n");
    } else {

        printf("\n");
        /*---------main code starts here----------*/
        FILE * myScriptFile = fopen(argv[1], "r");
        if (myScriptFile == NULL) {
            perror(argv[1]);
            return 1;
        }

        /* read txt file and split into array like java split() */

        char buffer[BUFFER_LEN];
        char *splitArr[MAX_TOKENS];

        while(fgets(buffer, BUFFER_LEN, myScriptFile) != NULL){

            /* split() writes only as many slots as there are tokens, so
             * clear the array first; unused slots then read as NULL */
            for (int k = 0; k < MAX_TOKENS; k++) {
                splitArr[k] = NULL;
            }

            split(buffer, splitArr, " ");

            for (int k = 0; k < SHOWN_TOKENS; k++) {
                printf("Index %d String: %s\n", k,
                       splitArr[k] != NULL ? splitArr[k] : "(null)");
            }
        }
        fclose(myScriptFile);
    }
    printf("\nProgram-Script Ended\n");
    return 0;
}

假设一个 .txt 文件有

Hello this is test
Hello2 this is test2

使用 .txt 文件作为参数运行它会给出

Index 0 String: Hello
Index 1 String: this
Index 2 String: is
Index 3 String: test

Index 0 String: Hello2
Index 1 String: this
Index 2 String: is
Index 3 String: test2

遇到这个寻找一个简单的解决方案。 我对所有选项都很着迷,但对我自己的用例/品味不满意(这可能很糟糕)。

我创建了一个有点独特的解决方案,旨在为其用户提供清晰的行为,而不是重新分配任何内存,并且是人类可读的 + 带有注释。

已上传到 gist.github: https://gist.github.com/RepComm/1e89f7611733ce0e75c8476d5ef66093

例子:

#include "./strutils.c"

/* Usage sketch: fill in source and delimiter, split, walk the pieces,
 * then release everything with str_split_end(). */
struct str_split_info info;
/* the source starts with the delimiter, so the first piece is empty */
info.source = " SPLIT ME hello SPLIT ME world SPLIT ME whats SPLIT ME going SPLIT ME on SPLIT ME today";
info.delimiter = " SPLIT ME ";

/* fills info.splitStrings and info.splitStringsCount */
str_split_begin(&info);

char * substr;

for (int i=0; i<info.splitStringsCount; i++) {
  substr = info.splitStrings[i];
  printf("substring: '%s'\n", substr);
}

/* frees what str_split_begin allocated */
str_split_end(&info);

输出:

$ ./test
substring: ''
substring: 'hello'
substring: 'world'
substring: 'whats'
substring: 'going'
substring: 'on'
substring: 'today'

strutils.c 的完整源代码

#ifndef STRUTILS_C
#define STRUTILS_C 1

#ifndef str
#define str char *
#endif

#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

struct str_split_info {
  /* The string to be split
  * Provided by caller of str_split_begin function
  */
  str source;
  /* The string that cuts the source string; no occurrence of it
  * appears in the resulting pieces
  *
  * Provided by caller of str_split_begin function
  */
  str delimiter;

  /* Array of strings split by delimiter
  * Provided and allocated by str_split_begin function
  * Must be released by str_split_end function
  */
  str * splitStrings;

  /* Array with the length of each string in splitStrings
    * Provided and allocated by str_split_begin function
    * Must be released by str_split_end function
    */
  int * splitStringsLengths;

  /* Number of strings contained in splitStrings
  * Provided by str_split_begin function
  */
  int splitStringsCount;
};
#define str_split_infop struct str_split_info *

/* Split a string by a delimiting string
*
* Reads info->source and info->delimiter and fills info->splitStrings,
* info->splitStringsLengths and info->splitStringsCount. Occurrences of the
* delimiter are matched left to right without overlapping; N occurrences
* give N + 1 pieces, and empty pieces are kept. An empty delimiter yields
* the whole source as a single piece. The source is not modified.
*
* If source or delimiter is NULL, or memory cannot be allocated, both
* arrays are set to NULL and splitStringsCount to 0.
*
* The caller is responsible only for calling str_split_end
* when finished with the results in 'info'
*/
void str_split_begin (str_split_infop info) {
  info->splitStrings = NULL;
  info->splitStringsLengths = NULL;
  info->splitStringsCount = 0;

  const char * source = info->source;
  const char * delimiter = info->delimiter;
  if (source == NULL || delimiter == NULL) {
    return;
  }

  const size_t delimiterLength = strlen(delimiter);

  //first pass, simply count occurrences so we can allocate only once
  //(an empty delimiter never matches: strstr would find it everywhere)
  int count = 1;
  if (delimiterLength > 0) {
    for (const char * hit = strstr(source, delimiter);
         hit != NULL;
         hit = strstr(hit + delimiterLength, delimiter)) {
      count++;
    }
  }

  //allocate arrays since we know the count
  str * strings = calloc((size_t) count, sizeof *strings);
  int * lengths = malloc((size_t) count * sizeof *lengths);
  if (strings == NULL || lengths == NULL) {
    free(strings);
    free(lengths);
    return;
  }

  //second pass, fill the arrays
  const char * begin = source;
  for (int i = 0; i < count; i++) {
    const bool isLast = (i + 1 == count);

    //every piece but the last ends at the next delimiter; the first pass
    //walked the same occurrences, so strstr cannot fail here
    const char * end = isLast ? begin + strlen(begin) : strstr(begin, delimiter);
    const size_t length = (size_t) (end - begin);

    //+1 for trailing null for c-string
    strings[i] = malloc(length + 1);
    if (strings[i] == NULL) {
      //undo the partial result before giving up
      while (i > 0) {
        free(strings[--i]);
      }
      free(strings);
      free(lengths);
      return;
    }

    memcpy(strings[i], begin, length);
    strings[i][length] = '\0';
    lengths[i] = (int) length;

    if (!isLast) {
      //next piece starts right after the current delimiter
      begin = end + delimiterLength;
    }
  }

  info->splitStrings = strings;
  info->splitStringsLengths = lengths;
  info->splitStringsCount = count;
}
/* Returns the splitStringsCount field stored in info. */
int str_split_count (str_split_infop info) {
  const int count = info->splitStringsCount;
  return count;
}


/* Copies every string of info->splitStrings into the matching slot of out
* with strcpy; each out[i] must already point to a buffer big enough for
* the string and its terminator. */
void str_split_get (str_split_infop info, str * out) {
  int i = 0;
  while (i < info->splitStringsCount) {
    strcpy(out[i], info->splitStrings[i]);
    i++;
  }
}

/* Frees each string in info->splitStrings, then both arrays, and resets
* splitStringsCount to 0. Does nothing when the count is not positive or
* splitStrings is NULL. */
void str_split_end (str_split_infop info) {
  if (info->splitStringsCount <= 0 || info->splitStrings == NULL) {
    return;
  }

  //release every split string first
  int i = 0;
  while (i < info->splitStringsCount) {
    free(info->splitStrings[i]);
    i++;
  }

  //then the array of string pointers
  free (info->splitStrings);

  //and the array of lengths
  free(info->splitStringsLengths);

  info->splitStringsCount = 0;
}

/* Smoke test: splits a fixed sentence on spaces, walks the result without
* using it, and releases it again. */
void str_split_test () {
  struct str_split_info info;

  info.source = "hello world this is a test";
  info.delimiter = " ";

  str_split_begin (&info);

  //walk the split substrings; they stay valid until str_split_end
  int i = 0;
  while (i < info.splitStringsCount) {
    // info.splitStrings[i];
    i++;
  }

  str_split_end(&info);
}

#endif

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM