C - Reading limited length words in unlimited length lines

Question

I would like to read words from a file and know when a new line starts.

I know there can be three, four or zero words per line and the words cannot be longer than a certain length. But the line length with spaces is not bounded, so it is not possible to just read a line to a string, parse and continue. I would like to know if there are three or four words in each line as I read it.

Currently I use fscanf and some problem-specific internal logic to decide if the fourth word I read is in a new line or the fourth in the previous line. But this way is fragile and easily broken.

I guess I could read char by char, ignore spaces and look for '\\n'. Is there a more elegant way?

Thank you

EDIT: I am limited to using C99 and standard libraries.

Answer 1

Here is some code that does a job closely related to what you request. There are a couple of major differences:

It doesn't believe that the user knows what they're supplying as data has to obey certain rules, so it assumes that the user will abuse those rules.
Consequently, it records all words found on each line, recording the words at full length, and therefore using dynamic memory allocation.

It's been through some fairly acid testing before I posted it. You compile with make UFLAGS=-DTEST to get shorter fragments of lines (64 bytes vs 4096 by default), and that also gives you extra diagnostic output. I did a lot of testing with MAX_LINE_LEN at 6 instead of 64 — it was good for debugging problems with words continued over multiple fragments of a line.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { MAX_WORD_CNT = 8 };

#ifdef TEST
static int debug = 1;
enum { MAX_LINE_LEN = 64 };
#else
static int debug = 0;
enum { MAX_LINE_LEN = 4096 };
#endif /* TEST */

typedef struct Word
{
    size_t length;
    char  *word;
} Word;

typedef struct WordList
{
    size_t  num_words;
    size_t  max_words;
    Word   *words;
} WordList;

typedef struct LineControl
{
    size_t   line_length;
    bool     part_word;
    size_t   part_len;
    WordList list;
} LineControl;

static void init_wordlist(WordList *list)
{
    list->num_words = 0;
    list->max_words = 0;
    list->words = 0;
}

static void free_wordlist(WordList *list)
{
    assert(list != 0);
    for (size_t i = 0; i < list->num_words; i++)
        free(list->words[i].word);
    free(list->words);
    init_wordlist(list);
}

static void extend_word(const char *extn, size_t ext_len, Word *word)
{
    if (debug)
        printf("old (%zu) = [%s]; extra (%zu) = [%.*s]\n", word->length, word->word,
                ext_len, (int)ext_len, extn);
    size_t space = word->length + ext_len + 1;
    char *new_space = realloc(word->word, space);
    if (new_space == 0)
    {
        fprintf(stderr, "failed to reallocate %zu bytes of memory\n", space);
        exit(EXIT_FAILURE);
    }
    word->word = new_space;
    memmove(word->word + word->length, extn, ext_len);
    word->length += ext_len;
    word->word[word->length] = '\0';
    if (debug)
        printf("new (%zu) = [%s]\n", word->length, word->word);
    }

static void addword_wordlist(const char *word, size_t word_len, WordList *list)
{
    if (list->num_words >= list->max_words)
    {
        assert(list->num_words == list->max_words);
        size_t new_max = list->max_words * 2 + 2;
        Word *new_words = realloc(list->words, new_max * sizeof(*new_words));
        if (new_words == 0)
        {
            fprintf(stderr, "failed to allocate %zu bytes of memory\n", new_max * sizeof(*new_words));
            exit(EXIT_FAILURE);
        }
        list->max_words = new_max;
        list->words = new_words;
    }
    list->words[list->num_words].word = malloc(word_len + 1);
    if (list->words[list->num_words].word == 0)
    {
        fprintf(stderr, "failed to allocate %zu bytes of memory\n", word_len + 1);
        exit(EXIT_FAILURE);
    }
    Word *wp = &list->words[list->num_words];
    wp->length = word_len;
    memmove(wp->word, word, word_len);
    wp->word[word_len] = '\0';
    list->num_words++;
}

static void init_linectrl(LineControl *ctrl)
{
    ctrl->line_length = 0;
    ctrl->part_word = false;
    ctrl->part_len = 0;
    init_wordlist(&ctrl->list);
}

static int parse_fragment(const char *line, LineControl *ctrl)
{
    char   whisp[] = " \t";
    size_t offset = 0;
    bool   got_eol = false;

    /* The only newline in the string is at the end, if it is there at all */
    assert(strchr(line, '\n') == strrchr(line, '\n'));
    assert(strchr(line, '\n') == 0 || *(strchr(line, '\n') + 1) == '\0');
    if (debug && ctrl->part_word)
    {
        assert(ctrl->list.num_words > 0);
        printf("Dealing with partial word on entry (%zu: [%s])\n",
               ctrl->part_len, ctrl->list.words[ctrl->list.num_words - 1].word);
    }

    size_t o_nonsp = 0;
    while (line[offset] != '\0')
    {
        size_t n_whisp = strspn(line + offset, whisp);
        size_t n_nonsp = strcspn(line + offset + n_whisp, whisp);
        if (debug)
            printf("offset %zu, whisp %zu, nonsp %zu\n", offset, n_whisp, n_nonsp);
        got_eol = false;
        ctrl->line_length += n_whisp + n_nonsp;
        if (line[offset + n_whisp + n_nonsp - 1] == '\n')
        {
            assert(n_nonsp > 0);
            got_eol = true;
            n_nonsp--;
        }
        if (n_whisp + n_nonsp == 0)
        {
            o_nonsp = 0;
            break;
        }

        if (n_whisp != 0)
        {
            ctrl->part_word = false;
            ctrl->part_len = 0;
        }

        /* Add words to list if the list is not already full */
        if (n_nonsp > 0)
        {
            const char *word = line + offset + n_whisp;
            if (ctrl->part_word)
            {
                assert(ctrl->list.num_words > 0);
                extend_word(word, n_nonsp,
                            &ctrl->list.words[ctrl->list.num_words - 1]);
            }
            else
            {
                addword_wordlist(word, n_nonsp, &ctrl->list);
            }
        }

        offset += n_whisp + n_nonsp;
        if (line[offset] != '\0')
        {
            ctrl->part_word = false;
            ctrl->part_len = 0;
        }
        o_nonsp = n_nonsp;
        if (got_eol)
            break;
    }

    /* Partial word detection */
    if (o_nonsp > 0 && !got_eol)
    {
        ctrl->part_word = true;
        ctrl->part_len += o_nonsp;
    }
    else
    {
        ctrl->part_word = false;
        ctrl->part_len = 0;
    }

    /* If seen newline; line complete */
    /* If No newline; line incomplete */
    return !got_eol;
}

int main(void)
{
    char line[MAX_LINE_LEN];
    size_t lineno = 0;

    while (fgets(line, sizeof(line), stdin) != 0)
    {
        LineControl ctrl;
        init_linectrl(&ctrl);
        lineno++;
        if (debug)
            printf("Line %zu: (%zu) [[%s]]\n", lineno, strlen(line), line);

        int extra = 0;
        while (parse_fragment(line, &ctrl) != 0 &&
               fgets(line, sizeof(line), stdin) != 0)
        {
            if (debug)
                printf("Extra %d for line %zu: (%zu) [[%s]]\n",
                       ++extra, lineno, strlen(line), line);
        }

        WordList *list = &ctrl.list;
        printf("Line %zu: length %zu, words = %zu\n",
               lineno, ctrl.line_length, list->num_words);
        size_t num_words = list->num_words;
        if (num_words > MAX_WORD_CNT)
            num_words = MAX_WORD_CNT;
        for (size_t i = 0; i < num_words; i++)
        {
            printf("  %zu: (%zu) %s\n",
                   i + 1, list->words[i].length, list->words[i].word);
        }
        putchar('\n');
        free_wordlist(&ctrl.list);
    }

    return 0;
}

I had a simpler version without the dynamic memory allocation but it didn't work properly when a word was split across two fragments of a line (so if the size of line fragment was 6 (5 characters plus null byte), and the maximum length of a word was 16, say, then the code ran into difficulties assembling the fragments. Consequently, I adopted a simpler approach — store all of every word. It isn't clear from the question what the maximum word sizes are. If the code should object to anything other than 0, 3 or 4 words, the data is available to make those complaints. If the code should object to words that are longer than some length such as 32, the data is available to make those complaints too.

One of the simpler test files is test-data.1 :

    a b   
    a b      c         d                                                        

1123xxsdfdsfsfdsfdssa          1234ddfxxyff            frrrdds
1123dfdffdfdxxxxxxxxxas                        1234ydfyyyzm   knsaaass      1234asdafxxfrrrfrrrsaa    
               1123werwetrretttrretertre       aaaa     bbbbbb      ccccc        
k
                                                apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                              apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                      apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                                  apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper

That has all sorts of tabs in it, as demonstrated by this version of the same data, where tabs are shown as \\t :

    a b   
    a b      c         d                                                        
\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
1123xxsdfdsfsfdsfdssa          1234ddfxxyff            frrrdds
1123dfdffdfdxxxxxxxxxas                        1234ydfyyyzm   knsaaass      1234asdafxxfrrrfrrrsaa    
               1123werwetrretttrretertre       aaaa     bbbbbb      ccccc        
k
  \t\t \t \t\t\t \t \t \t\t\t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t\t    \t\t\t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper  \t \t \t \t\t\t\t \t \tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t           \t\t\t\t \t \t \t \t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t\t\t\t\t    \t \t \t \t      \t \t \t

Running this awk script analyzes the data:

$ awk '{ printf "%3d %d [%s]\n", length($0) + 1, NF, $0 }' test-data.1
  1 0 []
  5 0 [    ]
 11 2 [    a b   ]
 81 4 [    a b      c         d                                                        ]
 20 0 [                                                     ]
 63 3 [1123xxsdfdsfsfdsfdssa          1234ddfxxyff            frrrdds]
103 4 [1123dfdffdfdxxxxxxxxxas                        1234ydfyyyzm   knsaaass      1234asdafxxfrrrfrrrsaa    ]
 82 4 [               1123werwetrretttrretertre       aaaa     bbbbbb      ccccc        ]
  2 1 [k]
494 4 [                                                 apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                              apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                      apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                      apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                           ]
$

The output from the program on that data file is:

Line 1: length 1, words = 0

Line 2: length 5, words = 0

Line 3: length 11, words = 2
  1: (1) a
  2: (1) b

Line 4: length 81, words = 4
  1: (1) a
  2: (1) b
  3: (1) c
  4: (1) d

Line 5: length 20, words = 0

Line 6: length 63, words = 3
  1: (21) 1123xxsdfdsfsfdsfdssa
  2: (12) 1234ddfxxyff
  3: (7) frrrdds

Line 7: length 103, words = 4
  1: (23) 1123dfdffdfdxxxxxxxxxas
  2: (12) 1234ydfyyyzm
  3: (8) knsaaass
  4: (22) 1234asdafxxfrrrfrrrsaa

Line 8: length 82, words = 4
  1: (25) 1123werwetrretttrretertre
  2: (4) aaaa
  3: (6) bbbbbb
  4: (5) ccccc

Line 9: length 2, words = 1
  1: (1) k

Line 10: length 494, words = 4
  1: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
  2: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
  3: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
  4: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper

You can see the data from the awk script appearing in the output.

This code is available in my SOQ (Stack Overflow Questions) repository on GitHub as files scan59.c , test-data.1 , test-data.2 and test-data.3 in the /Users/jleffler/soq/src/so-5201-4002 sub-directory. The test-data.3 file, in particular, contains one line with 9955 characters, and 693 words — as well as other lines that are less stringent tests.

The code runs compiles and runs cleanly on a Mac running macOS 10.13.6 High Sierra, using GCC 8.2.0 and Valgrind 3.14.0.GIT. (Although the makefile stipulates C11, there is nothing in this code that is specific to C11; it is fully compatible with C99. It also compiles cleanly with make SFLAGS='-std=c99 -pedantic' .)

C - Reading limited length words in unlimited length lines

Question

1 answers

solution1
2 ACCPTED 2018-08-27 07:05:01

C - Reading limited length words in unlimited length lines

Question

1 answers

solution1 2 ACCPTED 2018-08-27 07:05:01

solution1
2 ACCPTED 2018-08-27 07:05:01