[英]C - Reading limited length words in unlimited length lines
我想從文件中讀取單詞,並知道何時開始新的一行。
我知道每行可以有三個,四個或零個單詞,並且單詞不能超過一定長度。 但是帶空格的行長度沒有限制,因此不可能只讀取一行到字符串,進行解析並繼續。 我想知道在閱讀的每一行中是否有三個或四個單詞。
目前,我使用fscanf和一些特定於問題的內部邏輯來確定我讀取的第四個單詞是換行還是上一行中的第四個單詞。 但是這種方法很脆弱,很容易損壞。
我猜我可以逐個字符地讀取,忽略空格並查找 '\n'。 有沒有更優雅的方式?
謝謝
編輯:我僅限於使用C99和標准庫。
這是一些與您的要求緊密相關的代碼。 有兩個主要區別:
在我發布之前,它已經通過了一些相當嚴格的測試。 您可以使用 make UFLAGS=-DTEST 進行編譯,以獲取更短的行片段(64 字節,而默認為 4096 字節),這也會為您提供額外的診斷輸出。 我使用值為 6 而不是 64 的 MAX_LINE_LEN 進行了很多測試——這對於調試單詞跨越一行的多個片段的問題非常有用。
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Upper bound on the number of words main() prints for each line. */
enum { MAX_WORD_CNT = 8 };

/*
 * Build-time configuration.  Compiling with -DTEST shrinks the read buffer
 * to 64 bytes and switches the diagnostic output on; otherwise the buffer
 * is 4096 bytes and diagnostics are off.
 */
#ifndef TEST
enum { MAX_LINE_LEN = 4096 };
static int debug = 0;
#else
enum { MAX_LINE_LEN = 64 };
static int debug = 1;
#endif /* !TEST */
/* One word: its text and the length of that text. */
typedef struct Word Word;

struct Word
{
    size_t length;  /* number of characters in word, not counting the '\0' */
    char *word;     /* heap-allocated, NUL-terminated text */
};
/* Growable array of words. */
typedef struct WordList WordList;

struct WordList
{
    size_t num_words;   /* entries of words[] currently in use */
    size_t max_words;   /* entries of words[] currently allocated */
    Word *words;        /* heap-allocated array; null while the list is empty */
};
/* Running state for one logical input line, which may span several fragments. */
typedef struct LineControl LineControl;

struct LineControl
{
    size_t line_length;  /* characters consumed so far for this line */
    bool part_word;      /* true when the last fragment ended in the middle of a word */
    size_t part_len;     /* characters of that unfinished word seen so far */
    WordList list;       /* words collected for this line */
};
/* Put list into the empty state: no storage, zero capacity, zero words. */
static void init_wordlist(WordList *list)
{
    list->words = NULL;
    list->max_words = 0;
    list->num_words = 0;
}
/*
 * Release every word held by list, then the array itself, and leave the
 * list in the empty state so that it can be reused.
 */
static void free_wordlist(WordList *list)
{
    assert(list != 0);

    size_t idx = 0;
    while (idx < list->num_words)
    {
        free(list->words[idx].word);
        idx++;
    }

    free(list->words);
    init_wordlist(list);
}
/*
 * Append the first ext_len bytes of extn to the text already held in word.
 *
 * The word's buffer is resized to fit the combined text plus a terminating
 * '\0'.  If the reallocation fails, a message is written to stderr and the
 * program exits with EXIT_FAILURE.
 */
static void extend_word(const char *extn, size_t ext_len, Word *word)
{
    if (debug)
    {
        printf("old (%zu) = [%s]; extra (%zu) = [%.*s]\n", word->length, word->word,
               ext_len, (int)ext_len, extn);
    }

    const size_t old_len = word->length;
    const size_t new_len = old_len + ext_len;
    const size_t needed = new_len + 1;      /* room for the terminator */

    char *buffer = realloc(word->word, needed);
    if (buffer == NULL)
    {
        fprintf(stderr, "failed to reallocate %zu bytes of memory\n", needed);
        exit(EXIT_FAILURE);
    }

    /* Copy the extension after the existing text, then publish the result. */
    memmove(buffer + old_len, extn, ext_len);
    buffer[new_len] = '\0';
    word->word = buffer;
    word->length = new_len;

    if (debug)
    {
        printf("new (%zu) = [%s]\n", word->length, word->word);
    }
}
/*
 * Append a NUL-terminated copy of the first word_len bytes of word to list.
 *
 * The array of words is grown (to twice its capacity plus two) whenever it
 * is full.  If either allocation fails, a message is written to stderr and
 * the program exits with EXIT_FAILURE.
 */
static void addword_wordlist(const char *word, size_t word_len, WordList *list)
{
    /* Make room for one more entry when every slot is occupied. */
    if (list->num_words >= list->max_words)
    {
        assert(list->num_words == list->max_words);

        size_t capacity = 2 * list->max_words + 2;
        Word *grown = realloc(list->words, capacity * sizeof(*grown));
        if (grown == NULL)
        {
            fprintf(stderr, "failed to allocate %zu bytes of memory\n", capacity * sizeof(*grown));
            exit(EXIT_FAILURE);
        }
        list->words = grown;
        list->max_words = capacity;
    }

    /* Build the private copy of the text before touching the new slot. */
    const size_t nbytes = word_len + 1;
    char *copy = malloc(nbytes);
    if (copy == NULL)
    {
        fprintf(stderr, "failed to allocate %zu bytes of memory\n", nbytes);
        exit(EXIT_FAILURE);
    }
    memmove(copy, word, word_len);
    copy[word_len] = '\0';

    Word *slot = &list->words[list->num_words++];
    slot->word = copy;
    slot->length = word_len;
}
/* Reset ctrl for the start of a new line: empty word list, no partial word, zero length. */
static void init_linectrl(LineControl *ctrl)
{
    init_wordlist(&ctrl->list);
    ctrl->part_len = 0;
    ctrl->part_word = false;
    ctrl->line_length = 0;
}
/*
 * Parse one fragment of an input line and record the words it contains.
 *
 * line  NUL-terminated fragment.  It may contain at most one newline, and
 *       only as its last character (both conditions are asserted).
 * ctrl  State carried across the fragments of one line: the running line
 *       length, the word list, and whether the previous fragment stopped in
 *       the middle of a word.
 *
 * A word is a run of characters other than space and tab.  Each complete
 * run is appended to ctrl->list; when the previous fragment ended inside a
 * word and this fragment starts with non-blank characters, those characters
 * are appended to the last word in the list instead of starting a new one.
 *
 * Returns 0 when the fragment ended with a newline (the line is complete)
 * and non-zero when no newline was seen (the line continues in the next
 * fragment, or the input ended without a newline).
 */
static int parse_fragment(const char *line, LineControl *ctrl)
{
/* Blank characters that separate words; newline is deliberately not included */
char whisp[] = " \t";
size_t offset = 0;
bool got_eol = false;
/* The only newline in the string is at the end, if it is there at all */
assert(strchr(line, '\n') == strrchr(line, '\n'));
assert(strchr(line, '\n') == 0 || *(strchr(line, '\n') + 1) == '\0');
if (debug && ctrl->part_word)
{
assert(ctrl->list.num_words > 0);
printf("Dealing with partial word on entry (%zu: [%s])\n",
ctrl->part_len, ctrl->list.words[ctrl->list.num_words - 1].word);
}
/* Length of the non-blank run handled by the most recent loop iteration */
size_t o_nonsp = 0;
while (line[offset] != '\0')
{
/* Each iteration consumes one run of blanks followed by one run of non-blanks */
size_t n_whisp = strspn(line + offset, whisp);
size_t n_nonsp = strcspn(line + offset + n_whisp, whisp);
if (debug)
printf("offset %zu, whisp %zu, nonsp %zu\n", offset, n_whisp, n_nonsp);
got_eol = false;
/* Counted before the newline is stripped, so the line length includes it */
ctrl->line_length += n_whisp + n_nonsp;
/* Newline is not in whisp, so it is counted as the last non-blank character */
if (line[offset + n_whisp + n_nonsp - 1] == '\n')
{
assert(n_nonsp > 0);
got_eol = true;
n_nonsp--;
}
/* Nothing but the newline was left in the fragment */
if (n_whisp + n_nonsp == 0)
{
o_nonsp = 0;
break;
}
/* Leading blanks mean the word pending from the previous fragment is finished */
if (n_whisp != 0)
{
ctrl->part_word = false;
ctrl->part_len = 0;
}
/* Extend the word left unfinished by the previous fragment, or add a new word */
if (n_nonsp > 0)
{
const char *word = line + offset + n_whisp;
if (ctrl->part_word)
{
assert(ctrl->list.num_words > 0);
extend_word(word, n_nonsp,
&ctrl->list.words[ctrl->list.num_words - 1]);
}
else
{
addword_wordlist(word, n_nonsp, &ctrl->list);
}
}
offset += n_whisp + n_nonsp;
/* More characters follow in this fragment, so the word just handled is complete */
if (line[offset] != '\0')
{
ctrl->part_word = false;
ctrl->part_len = 0;
}
o_nonsp = n_nonsp;
if (got_eol)
break;
}
/* Partial word detection: the fragment ended on non-blank text with no newline */
if (o_nonsp > 0 && !got_eol)
{
ctrl->part_word = true;
ctrl->part_len += o_nonsp;
}
else
{
ctrl->part_word = false;
ctrl->part_len = 0;
}
/* If seen newline; line complete */
/* If No newline; line incomplete */
return !got_eol;
}
int main(void)
{
char line[MAX_LINE_LEN];
size_t lineno = 0;
while (fgets(line, sizeof(line), stdin) != 0)
{
LineControl ctrl;
init_linectrl(&ctrl);
lineno++;
if (debug)
printf("Line %zu: (%zu) [[%s]]\n", lineno, strlen(line), line);
int extra = 0;
while (parse_fragment(line, &ctrl) != 0 &&
fgets(line, sizeof(line), stdin) != 0)
{
if (debug)
printf("Extra %d for line %zu: (%zu) [[%s]]\n",
++extra, lineno, strlen(line), line);
}
WordList *list = &ctrl.list;
printf("Line %zu: length %zu, words = %zu\n",
lineno, ctrl.line_length, list->num_words);
size_t num_words = list->num_words;
if (num_words > MAX_WORD_CNT)
num_words = MAX_WORD_CNT;
for (size_t i = 0; i < num_words; i++)
{
printf(" %zu: (%zu) %s\n",
i + 1, list->words[i].length, list->words[i].word);
}
putchar('\n');
free_wordlist(&ctrl.list);
}
return 0;
}
我曾有一個不使用動態內存分配的較簡單版本,但是當一個單詞被分割到一行的兩個片段中時,它無法正常工作(例如,如果行片段的大小為 6(5 個字符加空字節),而單詞的最大長度為 16,那么代碼在組裝片段時就會遇到麻煩)。因此,我采用了一種更簡單的方法——存儲所有單詞。從問題中尚不清楚每行的最大單詞數是多少。如果代碼應該對 0、3 或 4 個單詞以外的行提出異議,則可以利用這些數據進行報告;如果代碼應該對長度超過某個值(例如 32)的單詞提出異議,這些數據同樣可以用於報告。
較簡單的測試文件之一是test-data.1
:
a b
a b c d
1123xxsdfdsfsfdsfdssa 1234ddfxxyff frrrdds
1123dfdffdfdxxxxxxxxxas 1234ydfyyyzm knsaaass 1234asdafxxfrrrfrrrsaa
1123werwetrretttrretertre aaaa bbbbbb ccccc
k
apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
該文件包含各種制表符;下面是同一份數據,其中制表符顯示為 \t:
a b
a b c d
\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t
1123xxsdfdsfsfdsfdssa 1234ddfxxyff frrrdds
1123dfdffdfdxxxxxxxxxas 1234ydfyyyzm knsaaass 1234asdafxxfrrrfrrrsaa
1123werwetrretttrretertre aaaa bbbbbb ccccc
k
\t\t \t \t\t\t \t \t \t\t\t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t\t \t\t\t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper \t \t \t \t\t\t\t \t \tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t \t\t\t\t \t \t \t \t\tapoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper\t\t\t\t\t\t \t \t \t \t \t \t \t
運行以下awk
腳本可分析數據:
$ awk '{ printf "%3d %d [%s]\n", length($0) + 1, NF, $0 }' test-data.1
1 0 []
5 0 [ ]
11 2 [ a b ]
81 4 [ a b c d ]
20 0 [ ]
63 3 [1123xxsdfdsfsfdsfdssa 1234ddfxxyff frrrdds]
103 4 [1123dfdffdfdxxxxxxxxxas 1234ydfyyyzm knsaaass 1234asdafxxfrrrfrrrsaa ]
82 4 [ 1123werwetrretttrretertre aaaa bbbbbb ccccc ]
2 1 [k]
494 4 [ apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper ]
$
該數據文件上程序的輸出為:
Line 1: length 1, words = 0
Line 2: length 5, words = 0
Line 3: length 11, words = 2
1: (1) a
2: (1) b
Line 4: length 81, words = 4
1: (1) a
2: (1) b
3: (1) c
4: (1) d
Line 5: length 20, words = 0
Line 6: length 63, words = 3
1: (21) 1123xxsdfdsfsfdsfdssa
2: (12) 1234ddfxxyff
3: (7) frrrdds
Line 7: length 103, words = 4
1: (23) 1123dfdffdfdxxxxxxxxxas
2: (12) 1234ydfyyyzm
3: (8) knsaaass
4: (22) 1234asdafxxfrrrfrrrsaa
Line 8: length 82, words = 4
1: (25) 1123werwetrretttrretertre
2: (4) aaaa
3: (6) bbbbbb
4: (5) ccccc
Line 9: length 2, words = 1
1: (1) k
Line 10: length 494, words = 4
1: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
2: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
3: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
4: (98) apoplectic-catastrophe-mongers-of-the-world-unite-for-you-have-nothing-to-lose-but-your-bad-temper
您可以看到awk
腳本中的數據出現在輸出中。
此代碼可在我的 GitHub 上的 SOQ(Stack Overflow Questions)存儲庫(jleffler/soq)中找到,即 src/so-5201-4002 子目錄中的文件 scan59.c、test-data.1、test-data.2 和 test-data.3。 尤其是 test-data.3 文件,其中有一行包含 9955 個字符和 693 個單詞,另外還有一些不那么嚴格的測試行。
使用 GCC 8.2.0 和 Valgrind 3.14.0.GIT,代碼可以在運行 macOS 10.13.6 High Sierra 的 Mac 上干凈地編譯並正常運行。 (盡管 makefile 規定了 C11,但是此代碼中沒有 C11 特有的東西;它與 C99 完全兼容。它也可以使用 make SFLAGS='-std=c99 -pedantic' 干凈地編譯。)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.