简体   繁体   中英

Split a string into tokens in C

/* String record shared by the functions in this snippet. */
struct string {
    size_t length;     /* upper bound for the scans in findFrist and strTokenizer */
    size_t allocated;  /* presumably the capacity reserved for data; unused in the code shown -- TODO confirm */
    char* data;        /* character buffer; strGet returns this pointer */
};
    string* strCreate(char* str) {...}
string* strSubstring(string* str, size_t pos, size_t len) {...}
/* Returns the character buffer stored in str (str->data); nothing is copied. */
char* strGet(string* str) {
    return str->data;
}
/*
 * Scans str->data for the character `what`, starting at index pos and
 * stopping before index str->length.
 *
 * Returns the index of the first match. When nothing matches, returns -1
 * converted to size_t, i.e. the largest size_t value.
 */
size_t findFrist(string* str, char what, size_t pos) {
    for(size_t i = pos; i < str->length; ++i) {
        if(str->data[i] == what)
            return i;
    }
    return -1; /* "not found"; converted to size_t by the return */
}
/*
 * Cuts str into pieces at each ' ' and returns them in an array of string*.
 *
 * The array always has 90 slots and nothing records how many were filled.
 * NOTE(review): the loop never compares i with 90, so a string with more
 * pieces than that would write past the end of res.
 */
string** strTokenizer(string* str) {
    string** res;
    /* NOTE(review): the slot size is sizeof(char*) although the slots hold string*. */
    res = malloc(sizeof(char*)*90); // 90 = number of token slots
    for(int i=0; i<90; i++) // visit each of the 90 slots
        res[i] = malloc(sizeof(char)*100); // 100-byte placeholder per slot
    size_t first = 0; // index where the current piece starts
    size_t i = 0;     // next slot of res to fill
    while(first < str->length) {
        // findFrist returns size_t; the value is narrowed to int here.
        int second = findFrist(str,' ',first);
        if(second == - 1) // no further space: the piece runs to the end of the string
            second = str->length;
        string* token = strSubstring(str,first, second - first);
        // NOTE(review): when the assignment below runs, the 100-byte placeholder
        // in res[i] is overwritten without being freed; when it does not run,
        // token is not stored anywhere.
        if(*strGet(token) != ' ')
            res[i] = token;
        first = second + 1; // resume just after the space
        ++i;
    }
    return res;
}
/* Tokenizes a sample sentence and prints the first four entries of the result back to back. */
int main() {
    /* The backslash-space pair in this literal is not a C escape sequence;
       GCC reports it as "unknown escape sequence: '\040'". */
    string* fe = strCreate("A string \ tof");
    string** r = strTokenizer(fe);
    for(int i = 0; i < 4; ++i) { /* the count 4 is hard-coded */
        printf("%s",strGet(r[i])); /* no separator or newline between entries */
    }
    return 0;
}

I want to create a string tokenizer. When I print the string in the main function, it does not print the \ . The other thing is how to allocate string** res in a proper way. When I allocate for the sizeof(char*), the only way that I see is to loop through the string, but I was wondering whether it is possible to allocate without going through the string two times in the tokenizer function.

I do not want to use strtok

typedef struct string string; is in the .h file

When compiling the code using GCC 10.2.0 with options:

gcc -O3 -g -std=c11 -Wall -Wextra -Werror …

I get the message:

bs83.c:85:44: error: unknown escape sequence: '\040' [-Werror]
   85 |     string *fe = strCreate("A string \ tof");

As I noted in the comments, the string "A string \ tof" is malformed — your compiler should be warning you about it. If you want a backslash in a string, you write (for example), "A string \\ tof" . Backslash-space has no defined meaning; it is probably being interpreted as a single space. Maybe you should print the argument to strCreate() to validate this.

You also need typedef struct string string; near the top of the code for it to compile with a C compiler (and the headers).

Here is a fixed version of your code — because of the compilation options I use, the functions need to be either static or declared before being defined; I make them static since the only reason to make them non-static (IMO) is if they're accessed from another source file, and then they'd be declared in a header. That probably applies to your code — there is mention of a header.

Here's fixed code that works:

/* SO 6537-9584 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct string string;

/*
 * Length-tracked character string. Objects are created by strCreate and
 * strSubstring.
 */
struct string
{
    size_t length;    /* number of characters in data, not counting the '\0' terminator */
    size_t allocated; /* recorded capacity: the creators set it to length * 2 and allocate allocated + 1 bytes */
    char *data;       /* the characters, followed by a '\0' terminator */
};

/*
 * Creates a new string object holding a copy of the C string str.
 *
 * The buffer is sized at twice the text length, plus one byte for the
 * '\0' terminator; that capacity (without the extra byte) is recorded in
 * the allocated field.
 *
 * Returns the new object, or NULL if str is NULL or an allocation fails.
 * The caller owns the result and releases it with free(s->data) followed
 * by free(s).
 */
static
string *strCreate(const char *str)
{
    if (str == NULL)
        return NULL;

    string *s = malloc(sizeof *s);
    if (s == NULL)
        return NULL;

    s->length = strlen(str);
    s->allocated = s->length * 2;
    s->data = malloc(s->allocated + 1);
    if (s->data == NULL)
    {
        /* Do not leak the header when the buffer cannot be allocated. */
        free(s);
        return NULL;
    }

    /* length + 1 copies the terminator together with the text. */
    memcpy(s->data, str, s->length + 1);
    return s;
}

/*
 * Creates a new string object holding a copy of up to len characters of
 * str, starting at index pos.
 *
 * The requested range is clamped to the source: a pos beyond the end
 * yields an empty string, and len is cut back so the copy never reads
 * past str->length. Ranges that already lie inside str are copied exactly
 * as requested.
 *
 * Returns the new object, or NULL if str is NULL or an allocation fails.
 * The caller owns the result and releases it with free(s->data) followed
 * by free(s).
 */
static
string *strSubstring(string *str, size_t pos, size_t len)
{
    if (str == NULL)
        return NULL;

    if (pos > str->length)
        pos = str->length;
    /* Written as a subtraction so that pos + len cannot overflow. */
    if (len > str->length - pos)
        len = str->length - pos;

    string *s = malloc(sizeof *s);
    if (s == NULL)
        return NULL;

    s->length = len;
    s->allocated = len * 2;
    s->data = malloc(s->allocated + 1);
    if (s->data == NULL)
    {
        free(s);
        return NULL;
    }

    if (len > 0)
        memcpy(s->data, &str->data[pos], len);
    s->data[len] = '\0';
    return s;
}

/* Exposes the character buffer held by str; no copy is made. */
static
char *strGet(string *str)
{
    char *text = str->data;
    return text;
}

/*
 * Finds the first occurrence of `what` in str at or after index pos.
 *
 * Returns the index of the match, or (size_t)-1 when the character does
 * not occur in that range (including when pos is at or past the end).
 */
static
size_t findFirst(string *str, char what, size_t pos)
{
    const size_t not_found = (size_t)-1;

    if (pos >= str->length)
        return not_found;

    const char *hit = memchr(str->data + pos, what, str->length - pos);
    if (hit == NULL)
        return not_found;
    return (size_t)(hit - str->data);
}

/*
 * Splits str at every ' ' and returns the pieces as a heap-allocated array
 * of newly created strings. The entry after the last token is NULL, so
 * callers can find the end of the array.
 *
 * Tokens appear in input order. A leading space, or two adjacent spaces,
 * produce an empty token; a single trailing space produces nothing extra.
 *
 * The array starts small and doubles whenever it fills up, so the input is
 * scanned only once and there is no fixed limit on the number or the
 * length of tokens.
 *
 * Returns NULL if str is NULL or if an allocation fails; in the failure
 * case everything allocated so far is released. Otherwise the caller owns
 * the array and every token in it.
 */
static
string **strTokenizer(string *str)
{
    if (str == NULL)
        return NULL;

    size_t capacity = 8;
    size_t count = 0;
    string **res = malloc(capacity * sizeof *res);
    if (res == NULL)
        return NULL;

    size_t first = 0;
    while (first < str->length)
    {
        /* Keep the result in size_t: findFirst signals "not found" with (size_t)-1. */
        size_t second = findFirst(str, ' ', first);
        if (second == (size_t)-1)
            second = str->length;

        /* Grow before storing so that one slot always remains for the NULL. */
        if (count + 1 == capacity)
        {
            size_t grown = capacity * 2;
            string **tmp = realloc(res, grown * sizeof *res);
            if (tmp == NULL)
                goto fail;
            res = tmp;
            capacity = grown;
        }

        /* The slice ends before the space, so it can never start with one. */
        string *token = strSubstring(str, first, second - first);
        if (token == NULL)
            goto fail;
        res[count++] = token;

        first = second + 1;
    }

    res[count] = NULL;
    return res;

fail:
    while (count > 0)
    {
        --count;
        free(res[count]->data);
        free(res[count]);
    }
    free(res);
    return NULL;
}

/*
 * Demo driver: tokenizes a sample sentence and prints its first four
 * tokens, one per line, each wrapped in [[ ]] so stray whitespace shows.
 */
int main(void)
{
    string *sentence = strCreate("A string \\ tof");
    string **tokens = strTokenizer(sentence);
    int idx = 0;

    while (idx < 4)
    {
        printf("[[%s]]\n", strGet(tokens[idx]));
        ++idx;
    }
    return 0;
}

The output is:

[[A]]
[[string]]
[[\]]
[[tof]]

Note the use of [[ and ]] around the printed strings (and the newlines). It makes it easier to identify trailing blanks and embedded carriage return characters and various other mishaps.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM