简体   繁体   中英

How to separate a string into an array of the unique characters/ strings

Basically I want to know if it is possible (if so how) to read a string from left to right and greedily terminate and append once that a new string is found. Eg.

"ABCABCABCABC" would give {"A" "B" "C" "AB" "CA" "BC" "ABC"}

I've been trying all day and all I end up with is broken code and crashing programs.

This is what I have that isn't working. The array is defined as *a[linelen]

for(i =0; i < linelen ;i++)
{
    j=0;
    k=0; 
    tempstr[j] = input[i]; // move character from input to tempstring 
        for(k=0; k< array_size; k++) //search through array
        {
            tempstr[j] = input[i];
            if(*a != tempstr)//(strcmp(a,tempstr)) != 0) // if str not in array
            {
                printf("%s\n", a[0]); //debug
                a[array_size] = tempstr;
                //strcpy(a[array_size], tempstr); //copy str into array
                array_size++;
                memset(tempstr,0,linelen-i); // reset tempstr to empty
                j=0;

            } 
            if( *a == tempstr)//(strcmp(a[array_size],tempstr)) == 0)
            {
                j++;
                tempstr[j] = input[i+1];
                if(i != linelen -1) // otherwise if tempstr already in array
                {
                    printf("%s\n",a[0]); //debug
                    j++;
                    tempstr[j] = input[i+1];
                }
                else if (i == linelen -1) // if it is the last letter
                {
                    a[array_size] = tempstr;
                    //strcpy(a[array_size], tempstr); // add to array
                    break;
                }

            }
        }

}

Here's one that uses a simple character array to store the "seen" strings:

#include <stdio.h>

#if 0
#define dbg(_fmt...)        printf(_fmt)
#else
#define dbg(_fmt...)        /**/
#endif

// NOTE: could be char * and realloc if necessary
char seen[5000];

// find -- find old string
// RETURNS: 1=found, 0=no match
int
find(char *str)
{
    char *lhs;
    char *rhs;
    int foundflg;

    dbg("find: str='%s'\n",str);

    rhs = str;
    lhs = seen;
    dbg("find: lhs='%s'\n",seen);

    foundflg = 0;
    for (;  lhs < str;  ++lhs, ++rhs) {
        dbg("find: TRY lhs='%s' rhs='%s'\n",lhs,rhs);

        if (*lhs != *rhs) {
            dbg("find: SKIP\n");
            for (;  *lhs != 0;  ++lhs);
            rhs = str - 1;
            continue;
        }

        if ((*lhs == 0) && (*rhs == 0)) {
            dbg("find: MATCH\n");
            foundflg = 1;
            break;
        }

        if (*rhs == 0)
            break;
    }

    return foundflg;
}

void
sepstr(const char *inp)
{
    int chr;
    char *lhs;
    char *rhs;
    int finflg;

    lhs = seen;
    rhs = seen;
    finflg = 0;

    for (chr = *inp;  chr != 0;  chr = *++inp) {
        *rhs++ = chr;
        *rhs = 0;

        if (find(lhs)) {
            finflg = 1;
            continue;
        }

        printf("%s\n",lhs);
        lhs = ++rhs;
        finflg = 0;
    }

    if (finflg)
        printf("%s\n",lhs);
}

int
main(int argc,char **argv)
{

#if 1
    sepstr("ABCABCABCABC");
#else
    sepstr("ABCABCABCABCABC");
#endif
}

Here's a second way to do it:

#include <stdio.h>

char out[500];

#ifdef BIG
#define SEEN 256
#else
#define SEEN (26 + 1)
#endif

char seen[SEEN][SEEN];

void
sepstr(const char *inp)
{
    int chr;
    char *prv;
    char *rhs;

    prv = seen[0];

    rhs = out;
    for (chr = *inp;  chr != 0;  chr = *++inp) {
        *rhs++ = chr;

#ifndef BIG
        chr = (chr - 'A') + 1;
#endif

        if (prv[chr]) {
            prv = seen[chr];
            continue;
        }

        *rhs = 0;
        printf("%s\n",out);

        prv[chr] = 1;
        rhs = out;
        prv = seen[0];
    }

    if (rhs > out) {
        *rhs = 0;
        printf("%s\n",out);
    }
}

int
main(void)
{

#if 1
    sepstr("ABCABCABCABC");
#else
    sepstr("ABCABCABCABCABC");
#endif

    return 0;
}

Here are some benchmarks for everybody's program (time in ns and printf nop'ed):

first      minimum author
         527          137 craig1 -- original -- uses single seen char array
         146           39 craig2 -- modified -- uses 2D seen table
       45234        45234 felix1 -- original -- may only be executed once
       40460          656 felix2 -- uses fixed input
          24           18 machine1 -- original -- uses buffer[20][20] on stack
         908          417 machine2 -- modified -- uses global buffer[20][20]
       43089         1120 milevyo1 -- original
       42719          711 milevyo2 -- parseString tmp is stack buffer no malloc
        7957          429 milevyo3 -- NewNode uses fixed pool no malloc
        7457          380 milevyo4 -- removed linked list

Yes, it is possible.

You just have to keep track of the different string occurrences. The easiest way is to use a Set.

Edit: see here for how to implement a Set in C: How to implement a Set data structure

Edit2: you can find an implementation here (I did not test it): HashSet.c

Here,this should do :

#include <stdio.h>
#include <string.h>

int main(void)
{
    char str[] = "ABCABCABCABC";

    //length of str
    size_t len = strlen(str);

    //buffer to hold extracted strings
    char buffer[20][20];

    //i : 1st buffer index , j : 2nd buffer index , n : variable used in the loop
    size_t i = 1 , j = 0 , n = 0 ;

    //store str[0] and '\0' to form a string : buffer[0]
    buffer[0][0] = str[0];
    buffer[0][1] = '\0';

    //has the string been found ?
    bool found = false;

    //n should start by 1 since we stored str[0] int buffer already
    for( n = 1 ; n < len ; n++ )
    {
        //store str[n] in buffer , increment j , and store '\0' to make a string
        buffer[i][j] = str[n];
        j++;
        buffer[i][j] = '\0';

        //this loop check if the string stored is found in the entire buffer.
        for( int x = 0 ; x < i ; x++ )
        {
            if( strcmp(buffer[i],buffer[x]) == 0 )
            {
                found = true;
            }
        }

        //if the string has not been found,increment i,to make a new string in the next iteration
        if( found == false)
        {
            i++;
            j = 0;
        }
        //reset the bool value
        found = false;
    }

    //print the strings stored in buffer.
    for( int x = 0 ; x < i ; x++ )
    {
        printf("%s\n",buffer[x]);
    }
}

Here's a fully dynamic solution in C99, but it's still draft code (not checking for out of memory conditions at all) and maybe quite inefficient (doesn't use hashing for example):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* a string builder */
typedef struct sb
{
    size_t capacity;
    size_t len;
    char str[];
} sb;

/* a container of substrings, represented by string builders */
typedef struct strcontainer
{
    size_t capacity;
    size_t len;
    sb **substrings;
} strcontainer;

/* global maximum length of substrings seen so far */
static size_t maxlen;

/* container instances */
static strcontainer *containers;

/* initialize a new container */
static void strcontainer_init(strcontainer *self)
{
    self->capacity = 16;
    self->len = 0;
    self->substrings = malloc(16 * sizeof(sb *));
}

/* create a new string builder */
static sb *sb_create(void)
{
    sb *self = malloc(sizeof(sb) + 16);
    self->capacity = 16;
    self->len = 0;
    self->str[0] = 0;
    return self;
}

/* append a character to a string builder */
static sb *sb_append(sb *self, int c)
{
    self->str[self->len++] = (char) c;
    if (self->len == self->capacity)
    {
        self->capacity *= 2;
        self = realloc(self, sizeof(sb) + self->capacity);
    }
    self->str[self->len] = 0;
    return self;
}

/* get plain C string from a string builder */
static const char *sb_str(const sb *self)
{
    return &(self->str[0]);
}

/* check whether a substring with the contents of the given string builder is
 * already present and increase maximum length and count of containers if
 * necessary */
static int sb_ispresent(const sb *self)
{
    if (self->len > maxlen)
    {
        size_t oldlen = maxlen + 1;
        maxlen = self->len;
        containers = realloc(containers,
                (maxlen + 1) * sizeof(strcontainer));
        for (; oldlen <= maxlen; ++oldlen)
        {
            strcontainer_init(containers + oldlen);
        }
        return 0;
    }

    strcontainer *container = containers + self->len;

    for (size_t i = 0; i < container->len; ++i)
    {
        if (!strcmp(sb_str(self), sb_str(container->substrings[i])))
        {
            return 1;
        }
    }
    return 0;
}

/* check whether container has space left and if not, expand it */
static void strcontainer_checkexpand(strcontainer *self)
{
    if (self->len == self->capacity)
    {
        self->capacity *= 2;
        self->substrings = realloc(self->substrings,
                self->capacity * sizeof(sb *));
    }
}

/* insert a string builder as new substring in a container */
static void strcontainer_insert(strcontainer *self, sb *str)
{
    strcontainer_checkexpand(self);
    self->substrings[self->len++] = str;
}

/* insert this string builder instance in the appropriate containers */
static void sb_insert(sb *self)
{
    strcontainer_insert(containers, self);
    strcontainer_insert(containers + self->len, self);
}

int main(void)
{
    int c;
    size_t i = 0;

    /* idea here: allocate a global container and one for each substring
     * length. start with a maximum length of 1, makes 2 containers */
    containers = malloc(2 * sizeof(strcontainer));
    strcontainer_init(containers);
    strcontainer_init(containers+1);
    maxlen = 1;

    /* string builder for the substring */
    sb *builder = 0;

    while ((c = getchar()) != EOF)
    {
        /* on newline, output what we have so far */
        if (c == '\n')
        {
            while (i < containers->len)
            {
                puts(sb_str(containers->substrings[i++]));
            }
            continue;
        }

        /* ignore carriage returns, maybe ignore some other characters
         * here too? */
        if (c == '\r') continue;

        /* append each character to the string builder */
        if (!builder) builder = sb_create();
        builder = sb_append(builder, c);

        /* check whether we have seen the string already after every append */
        if (!sb_ispresent(builder))
        {
            /*then insert and restart with a new string builder */
            sb_insert(builder);
            builder = 0;
        }
    }

    /* more output after EOF */
    while (i < containers->len)
    {
        puts(sb_str(containers->substrings[i++]));
    }

    /* if we still have a builder, there was some non-unique text left over
     * at the end of the input */
    if (builder)
    {
        fprintf(stderr, "Left over: `%s'\n", sb_str(builder));
    }

    /* might want to clean up on the heap with some free()s ...
     * not strictly necessary at end of program */

    return 0;
}

example:

> echo "ABCABCABCABCABC" | ./greadystring
A
B
C
AB
CA
BC
ABC
Left over: `ABC'

i am using linked list to avoid duplicate items. check the output at the end.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>


typedef struct NODE NODE;
struct NODE{
    char value[20];
    NODE *next;
};
NODE *head=NULL;
/*_________________________________________________
*/
NODE *FindNode(char *p){
    NODE *tmp=head;
    while(tmp){
        if(_stricmp(tmp->value,p)==0) break;
        tmp=tmp->next;
    }
    return tmp;
}
/*_________________________________________________
*/
NODE *NewNode(char *p){
    NODE *tmp=calloc(1,sizeof(NODE));
    if(tmp){
        strcpy(tmp->value,p);
    }
    return tmp;
}
/*_________________________________________________
*/

int AddNode(char *p){

    NODE * tmp=FindNode(p);
    if(!tmp){
        if((tmp=NewNode(p))){
            if(!head)
                head=tmp;
            else{
                NODE *_tmp=head;
                while(_tmp->next)_tmp=_tmp->next;
                _tmp->next=tmp;
            }
            return 1;
        }

    }
    return 0;
}
/*_________________________________________________
*/
void printNodes(void){
    NODE *tmp=head;
    printf("{");
    while(tmp){
        printf("\"%s\"",tmp->value);
       tmp=tmp->next;
        if(tmp)printf(",");
    }
    printf("}\n");
 }
/*_________________________________________________
*/
void deleteNodes(void){
    NODE *tmp=head;
    while(tmp){
       head=tmp->next;
        free(tmp);
        tmp=head;
    }
 }
/*_________________________________________________
*/
void parseString(char *buff){
    int  buffSize=  strlen(buff);
    if(!buffSize) return;

    char *tmp;
    char *ptr=buff;

    int j=1,n=0;

    for(ptr=buff;n<buffSize;ptr+=j){
        tmp=calloc(sizeof(char),j+1);
        strncpy(tmp,ptr,j);
        if(!*tmp){
            free(tmp);
            break;
        }

        if(!AddNode(tmp)){
            j++;
            ptr-=j;

        }else
            n+=j;
        free(tmp);
    }

    printf("%s\n",buff);
    printNodes();
    printf("\n");
    deleteNodes();

}
int main(void){
    parseString("ABCABCABCABC");
    parseString("ABCABCABCABCABCABCABCABC");
    return 0;
}

here is the output:

ABCABCABCABC
{"A","B","C","AB","CA","BC","ABC"}

ABCABCABCABCABCABCABCABC
{"A","B","C","AB","CA","BC","ABC","ABCA","BCAB","CABC"}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM