簡體   English   中英

C-在不限長度的行中讀取有限長度的單詞

[英]C - Reading limited length words in unlimited length lines

我想從文件中讀取單詞,並知道何時開始新的一行。

我知道每行可以有三個,四個或零個單詞,並且單詞不能超過一定長度。 但是帶空格的行長度沒有限制,因此不可能只讀取一行到字符串,進行解析並繼續。 我想知道在閱讀的每一行中是否有三個或四個單詞。

目前,我使用fscanf和一些特定於問題的內部邏輯來確定我讀取的第四個單詞是換行還是上一行中的第四個單詞。 但是這種方法很脆弱,很容易損壞。

我猜我可以逐個字符地讀取,忽略空格並查找'\n'。 有沒有更優雅的方式?

謝謝

編輯:我僅限於使用C99和標准庫。

這是一些與您的要求緊密相關的代碼。 有兩個主要區別:

  1. 它不會僅僅因為用戶聲稱數據必須遵守某些規則就信任用戶,而是假定用戶會違反這些規則。
  2. 因此,它會記錄在每一行上找到的所有單詞,並保存每個單詞的完整長度,所以使用了動態內存分配。

在我發布之前,它已經通過了一些相當嚴格的測試。 您可以使用make UFLAGS=-DTEST進行編譯,以獲取更短的行片段(64字節,而默認為4096字節),同時還會得到額外的診斷輸出。 我使用6而不是64作為MAX_LINE_LEN進行了很多測試——這對於調試單詞跨越一行的多個片段的問題非常有用。

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Upper bound on the number of words main() prints for one line. */
enum { MAX_WORD_CNT = 8 };

/*
 * Build-time configuration.  Compiling with -DTEST switches on the
 * diagnostic output and shrinks the read buffer so that input lines
 * are split into many fragments.
 */
#if defined(TEST)
enum { MAX_LINE_LEN = 64 };
static int debug = 1;
#else
enum { MAX_LINE_LEN = 4096 };
static int debug = 0;
#endif /* TEST */

/* One word: a null-terminated string together with its length. */
struct Word
{
    size_t length;  /* characters in word, not counting the terminator */
    char *word;     /* null-terminated text of the word */
};
typedef struct Word Word;

/* Growable array of Word entries. */
struct WordList
{
    size_t num_words;  /* entries of words[] currently in use */
    size_t max_words;  /* entries of words[] currently allocated */
    Word *words;       /* entry storage; null after init_wordlist() */
};
typedef struct WordList WordList;

/* Parsing state carried across the fragments of one input line. */
struct LineControl
{
    size_t line_length;  /* characters consumed so far, newline included */
    bool part_word;      /* true when the last fragment ended inside a word */
    size_t part_len;     /* length accumulated for that partial word (diagnostics only) */
    WordList list;       /* words collected so far */
};
typedef struct LineControl LineControl;

/* Put a word list into its empty state: no storage, no capacity, no entries. */
static void init_wordlist(WordList *list)
{
    list->words = NULL;
    list->max_words = 0;
    list->num_words = 0;
}

/*
 * Release every word held by the list and the entry array itself,
 * then return the list to its empty state so it can be reused.
 */
static void free_wordlist(WordList *list)
{
    assert(list != 0);

    size_t idx = 0;
    while (idx < list->num_words)
    {
        free(list->words[idx].word);
        idx++;
    }

    free(list->words);
    init_wordlist(list);
}

/*
 * Append ext_len bytes taken from extn to the text stored in word,
 * growing the allocation and keeping the result null-terminated.
 * extn does not have to be null-terminated.  The program exits with
 * EXIT_FAILURE if the memory cannot be obtained.
 */
static void extend_word(const char *extn, size_t ext_len, Word *word)
{
    if (debug)
    {
        printf("old (%zu) = [%s]; extra (%zu) = [%.*s]\n", word->length, word->word,
               ext_len, (int)ext_len, extn);
    }

    const size_t old_len = word->length;
    const size_t new_len = old_len + ext_len;
    const size_t space = new_len + 1;   /* room for the terminator */

    char *grown = realloc(word->word, space);
    if (grown == NULL)
    {
        fprintf(stderr, "failed to reallocate %zu bytes of memory\n", space);
        exit(EXIT_FAILURE);
    }

    memmove(grown + old_len, extn, ext_len);
    grown[new_len] = '\0';
    word->word = grown;
    word->length = new_len;

    if (debug)
    {
        printf("new (%zu) = [%s]\n", word->length, word->word);
    }
}

/*
 * Append a copy of the word_len bytes at word to the list as a new,
 * null-terminated entry.  The entry array is enlarged (to twice its
 * capacity plus two) whenever it is full.  The program exits with
 * EXIT_FAILURE if memory cannot be obtained.
 */
static void addword_wordlist(const char *word, size_t word_len, WordList *list)
{
    /* Make room for one more entry when the array is full. */
    if (list->num_words >= list->max_words)
    {
        assert(list->num_words == list->max_words);
        const size_t new_max = list->max_words * 2 + 2;
        const size_t nbytes = new_max * sizeof(list->words[0]);
        Word *grown = realloc(list->words, nbytes);
        if (grown == NULL)
        {
            fprintf(stderr, "failed to allocate %zu bytes of memory\n", nbytes);
            exit(EXIT_FAILURE);
        }
        list->words = grown;
        list->max_words = new_max;
    }

    /* Build the private copy of the text first, then fill in the slot. */
    const size_t copy_size = word_len + 1;
    char *copy = malloc(copy_size);
    if (copy == NULL)
    {
        fprintf(stderr, "failed to allocate %zu bytes of memory\n", copy_size);
        exit(EXIT_FAILURE);
    }
    memmove(copy, word, word_len);
    copy[word_len] = '\0';

    Word *slot = &list->words[list->num_words];
    slot->word = copy;
    slot->length = word_len;
    list->num_words++;
}

/* Reset the per-line parsing state: empty word list, no pending partial word. */
static void init_linectrl(LineControl *ctrl)
{
    init_wordlist(&ctrl->list);
    ctrl->part_len = 0;
    ctrl->part_word = false;
    ctrl->line_length = 0;
}

/*
 * Parse one fragment of an input line and add the words it contains to
 * ctrl->list.  Words are separated by blanks and tabs.
 *
 * If the previous fragment ended in the middle of a word (ctrl->part_word
 * is set) and this fragment starts with non-blank text, that text is
 * appended to the last word in the list instead of being added as a new
 * word.
 *
 * line: null-terminated fragment; it may contain at most one newline,
 *       which must be its last character (checked by assertions).
 * ctrl: running state for the current line; line_length, part_word,
 *       part_len and list are updated.
 *
 * Returns 0 when the fragment ended with a newline (the line is complete)
 * and 1 when no newline was seen (the line continues in the next fragment).
 */
static int parse_fragment(const char *line, LineControl *ctrl)
{
    char   whisp[] = " \t";     /* word separators: blank and tab */
    size_t offset = 0;          /* current position within line */
    bool   got_eol = false;     /* set once the newline has been reached */

    /* The only newline in the string is at the end, if it is there at all */
    assert(strchr(line, '\n') == strrchr(line, '\n'));
    assert(strchr(line, '\n') == 0 || *(strchr(line, '\n') + 1) == '\0');
    if (debug && ctrl->part_word)
    {
        assert(ctrl->list.num_words > 0);
        printf("Dealing with partial word on entry (%zu: [%s])\n",
               ctrl->part_len, ctrl->list.words[ctrl->list.num_words - 1].word);
    }

    /* Length of the last word seen in this fragment (0 if it ended otherwise) */
    size_t o_nonsp = 0;
    while (line[offset] != '\0')
    {
        /* Leading separators, then the run of non-separators that follows. */
        /* Newline is not a separator, so a trailing newline is part of n_nonsp. */
        size_t n_whisp = strspn(line + offset, whisp);
        size_t n_nonsp = strcspn(line + offset + n_whisp, whisp);
        if (debug)
            printf("offset %zu, whisp %zu, nonsp %zu\n", offset, n_whisp, n_nonsp);
        got_eol = false;
        /* The line length counts the newline too (added before it is trimmed) */
        ctrl->line_length += n_whisp + n_nonsp;
        /* line[offset] is not '\0' here, so n_whisp + n_nonsp >= 1 and the index is valid */
        if (line[offset + n_whisp + n_nonsp - 1] == '\n')
        {
            assert(n_nonsp > 0);
            got_eol = true;
            n_nonsp--;      /* The newline is not part of the word */
        }
        /* Only the newline was left: nothing to store */
        if (n_whisp + n_nonsp == 0)
        {
            o_nonsp = 0;
            break;
        }

        /* Separators before the text mean any pending partial word is finished */
        if (n_whisp != 0)
        {
            ctrl->part_word = false;
            ctrl->part_len = 0;
        }

        /* Store the word: extend the word cut by the fragment boundary, or add a new entry */
        if (n_nonsp > 0)
        {
            const char *word = line + offset + n_whisp;
            if (ctrl->part_word)
            {
                assert(ctrl->list.num_words > 0);
                extend_word(word, n_nonsp,
                            &ctrl->list.words[ctrl->list.num_words - 1]);
            }
            else
            {
                addword_wordlist(word, n_nonsp, &ctrl->list);
            }
        }

        offset += n_whisp + n_nonsp;
        /* More text follows in this fragment, so the word just handled is complete */
        if (line[offset] != '\0')
        {
            ctrl->part_word = false;
            ctrl->part_len = 0;
        }
        o_nonsp = n_nonsp;
        if (got_eol)
            break;
    }

    /* Partial word detection */
    /* The fragment ended on word text with no newline: it may continue in the next fragment */
    if (o_nonsp > 0 && !got_eol)
    {
        ctrl->part_word = true;
        ctrl->part_len += o_nonsp;
    }
    else
    {
        ctrl->part_word = false;
        ctrl->part_len = 0;
    }

    /* If seen newline; line complete */
    /* If No newline; line incomplete */
    return !got_eol;
}

int main(void)
{
    char line[MAX_LINE_LEN];
    size_t lineno = 0;

    while (fgets(line, sizeof(line), stdin) != 0)
    {
        LineControl ctrl;
        init_linectrl(&ctrl);
        lineno++;
        if (debug)
            printf("Line %zu: (%zu) [[%s]]\n", lineno, strlen(line), line);

        int extra = 0;
        while (parse_fragment(line, &ctrl) != 0 &&
               fgets(line, sizeof(line), stdin) != 0)
        {
            if (debug)
                printf("Extra %d for line %zu: (%zu) [[%s]]\n",
                       ++extra, lineno, strlen(line), line);
        }

        WordList *list = &ctrl.list;
        printf("Line %zu: length %zu, words = %zu\n",
               lineno, ctrl.line_length, list->num_words);
        size_t num_words = list->num_words;
        if (num_words > MAX_WORD_CNT)
            num_words = MAX_WORD_CNT;
        for (size_t i = 0; i < num_words; i++)
        {
            printf("  %zu: (%zu) %s\n",
                   i + 1, list->words[i].length, list->words[i].word);
        }
        putchar('\n');
        free_wordlist(&ctrl.list);
    }

    return 0;
}

我曾有一個不使用動態內存分配的較簡單版本,但是當一個單詞被拆分到一行的兩個片段中時,它無法正常工作(例如,如果行片段的大小為6(5個字符加空字節),而單詞的最大長度為16,那么代碼在組裝片段時就會遇到麻煩)。因此,我采用了一種更簡單的方法——存儲所有單詞。從這個問題中尚不清楚最大單詞數是多少。如果應當拒絕0、3或4個單詞以外的任何情況,則可以利用這些數據提出警告;如果代碼應當拒絕長度超過某個值(例如32)的單詞,這些數據同樣可以用來提出警告。

較簡單的測試文件之一是test-data.1

    a b   
    a b      c         d                                                        

1123xxsdfdsfsfdsfdssa          1234ddfxxyff            frrrdds
1123dfdffdfdxxxxxxxxxas                        1234ydfyyyzm   knsaaass      1234asdafxxfrrrfrrrsaa    
               1123werwetrretttrretertre       aaaa     bbbbbb      ccccc        
k
                                                apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                              apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                      apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                                  apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                                           

下面是同一數據的另一種顯示形式;數據中包含各種制表符,這里將制表符顯示為\t:

    a b   
    a b      c         d                                                        
\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
1123xxsdfdsfsfdsfdssa          1234ddfxxyff            frrrdds
1123dfdffdfdxxxxxxxxxas                        1234ydfyyyzm   knsaaass      1234asdafxxfrrrfrrrsaa    
               1123werwetrretttrretertre       aaaa     bbbbbb      ccccc        
k
  \t\t \t \t\t\t \t \t \t\t\t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t\t    \t\t\t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper  \t \t \t \t\t\t\t \t \tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t           \t\t\t\t \t \t \t \t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t\t\t\t\t    \t \t \t \t      \t \t \t 

運行以下awk腳本可分析數據:

$ awk '{ printf "%3d %d [%s]\n", length($0) + 1, NF, $0 }' test-data.1
  1 0 []
  5 0 [    ]
 11 2 [    a b   ]
 81 4 [    a b      c         d                                                        ]
 20 0 [                                                     ]
 63 3 [1123xxsdfdsfsfdsfdssa          1234ddfxxyff            frrrdds]
103 4 [1123dfdffdfdxxxxxxxxxas                        1234ydfyyyzm   knsaaass      1234asdafxxfrrrfrrrsaa    ]
 82 4 [               1123werwetrretttrretertre       aaaa     bbbbbb      ccccc        ]
  2 1 [k]
494 4 [                                                 apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                              apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                      apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                      apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper                                           ]
$

該數據文件上程序的輸出為:

Line 1: length 1, words = 0

Line 2: length 5, words = 0

Line 3: length 11, words = 2
  1: (1) a
  2: (1) b

Line 4: length 81, words = 4
  1: (1) a
  2: (1) b
  3: (1) c
  4: (1) d

Line 5: length 20, words = 0

Line 6: length 63, words = 3
  1: (21) 1123xxsdfdsfsfdsfdssa
  2: (12) 1234ddfxxyff
  3: (7) frrrdds

Line 7: length 103, words = 4
  1: (23) 1123dfdffdfdxxxxxxxxxas
  2: (12) 1234ydfyyyzm
  3: (8) knsaaass
  4: (22) 1234asdafxxfrrrfrrrsaa

Line 8: length 82, words = 4
  1: (25) 1123werwetrretttrretertre
  2: (4) aaaa
  3: (6) bbbbbb
  4: (5) ccccc

Line 9: length 2, words = 1
  1: (1) k

Line 10: length 494, words = 4
  1: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
  2: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
  3: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
  4: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper

您可以看到awk腳本中的數據出現在輸出中。

此代碼可在我在GitHub上的SOQ(Stack Overflow Questions)存儲庫中找到,即src/so-5201-4002子目錄下的文件scan59.c、test-data.1、test-data.2和test-data.3。 尤其是test-data.3文件,其中有一行包含9955個字符和693個單詞,另外還有一些不那么嚴格的測試行。

使用GCC 8.2.0和Valgrind 3.14.0.GIT,代碼可以在運行macOS 10.13.6 High Sierra的Mac上干凈地編譯並運行。 (盡管makefile規定了C11,但是此代碼中沒有C11特有的東西;它與C99完全兼容。它也可以使用make SFLAGS='-std=c99 -pedantic'干凈地編譯。)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM