I am trying to solve this challenge: https://www.hackerrank.com/challenges/structuring-the-document/problem
Basically I have been given a locked stub of code with structs in it and I am supposed to parse a given text. This is an abridged version of my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#define MAX_CHARACTERS 1005
#define MAX_PARAGRAPHS 5
#include <ctype.h>
/* A single word of the text, held as a NUL-terminated string. */
struct word {
    char *data;   /* characters of the word */
};
/* A sentence: an array of words plus the number of entries in it. */
struct sentence {
    struct word *data;   /* the words of the sentence */
    int word_count;      /* number of words in the sentence */
};
/* A paragraph: an array of sentences plus the number of entries in it. */
struct paragraph {
    struct sentence *data;   /* the sentences of the paragraph */
    int sentence_count;      /* number of sentences in the paragraph */
};
/* A document: an array of paragraphs plus the number of entries in it. */
struct document {
    struct paragraph *data;   /* the paragraphs of the document */
    int paragraph_count;      /* number of paragraphs in the document */
};
struct document get_document(char* text) {
int spaces = 0, periods = 0, newlines = 0;
for(int i = 0; i < strlen(text); i++)
if(text[i] == ' ')
spaces++;
else if(text[i] == '.')
periods++;
else if(text[i] == '\n')
newlines++;
struct document doc;
doc.paragraph_count = newlines + 1;
doc.data = malloc((newlines + 1) * sizeof(struct paragraph));
struct paragraph para[doc.paragraph_count];
for(int i = 0; i < doc.paragraph_count; i++) {
para[i].sentence_count = periods + 1;
para[i].data = malloc((periods + 1) * sizeof(struct sentence));
}
struct sentence sen[para[0].sentence_count];
for(int i = 0; i < para[0].sentence_count; i++) {
sen[i].word_count = spaces + 1;
sen[i].data = malloc((spaces + 1) * sizeof(struct word));
}
struct word word[spaces + periods + 1];
int start = 0, k = 0, wordsub = 0, sensub = 0, parasub = 0, docsub = 0, wordno = 0, parano = 0;
for(int i = 0; i < strlen(text); i++) {
if(text[i] == ' ' || text[i] == '.') {
word[wordsub].data = malloc((i - start) * sizeof(char) + 1);
for(int j = start; j < i; j++)
word[wordsub].data[k++] = text[j];
word[wordsub].data[k++] = '\0';
k = 0;
if(i < strlen(text) - 1 && text[i + 1] == '\n')
start = i + 2;
else
start = i + 1;
if(text[i] == ' ') {
sen[sensub].data[wordno++] = word[wordsub++]; //wordno can be 0 or 1
}
if(i != strlen(text) && isalpha(text[i + 1]) && text[i] == '.') {
sen[sensub].data[wordno++] = word[wordsub++];
wordno = 0;
para[parasub].data[parano++] = sen[sensub++];
}
if((i != strlen(text) && text[i + 1] == '\n') || i + 1 == strlen(text)) {
sen[sensub++].data[wordno++] = word[wordsub];
wordno = 0;
parano = 0;
para[parasub].data[parano++] = sen[sensub];
doc.data[docsub++] = para[parasub++];
}
}
}
printf("%s\n", para[0].data[0].data[0].data);// should print "hello"
return doc;
}
int main() {
struct document doc;
char * text = "hello world.\nhi.bye.\nwow.";
doc = get_document(text);
printf("%s\n", doc.data[0].data[0].data[0].data);//should also print "hello"
}
The problem is that the print statements do not print "hello". Also, if I change the indices in the print statements, I get a segmentation fault.
Here:
word[wordsub].data[k++] = text[j];
you are accessing the data member outside of its allocated memory.
The problem statement specifies that there are never two terminators after a word. There should also be one word at least.
So, the test phrase
"hello world.\nhi.bye.\nwow."
does not fit, but
"hello world\nhi.bye\nwow"
fits and you will have "hello" printed.
Besides, your algorithm is very complex while the code could be simpler. It was fun to try and I did it.
First, let's use some typedefs to write less text!
/* W: one word of the text, held as a NUL-terminated string. */
typedef struct word {
    char *data;   /* characters of the word */
} W;
/* S: a sentence, i.e. an array of words plus the number of entries in it. */
typedef struct sentence {
    W *data;          /* the words of the sentence */
    int word_count;   /* number of words in the sentence */
} S;
/* P: a paragraph, i.e. an array of sentences plus the number of entries in it. */
typedef struct paragraph {
    S *data;              /* the sentences of the paragraph */
    int sentence_count;   /* number of sentences in the paragraph */
} P;
/* DOC: a document, i.e. an array of paragraphs plus the number of entries in it. */
typedef struct document {
    P *data;               /* the paragraphs of the document */
    int paragraph_count;   /* number of paragraphs in the document */
} DOC;
Then the function itself. The logic is simple: for each char of text, do all of the following, in sequence:
- if it is a word separator (' ', '.' or '\n'), record the word;
- if it is a sentence terminator ('.' or '\n'), record the sentence;
- if it is a paragraph terminator ('\n'), record a paragraph.
The end of the string counts as the end of a paragraph.
Code
/*
 * realloc() wrapper: returns the resized block, or terminates the program
 * with a diagnostic when the memory cannot be obtained. Passing ptr == NULL
 * makes it behave like malloc(size). `size` must be non-zero.
 */
static void *doc_realloc(void *ptr, size_t size)
{
    void *mem = realloc(ptr, size);

    if (mem == NULL) {
        perror("get_document");
        exit(EXIT_FAILURE);
    }
    return mem;
}

/*
 * Split `text` into paragraphs, sentences and words.
 *
 * Every character is examined once, including the terminating '\0', which
 * is treated as a final '\n':
 *   - ' ', '.' or '\n'  -> the characters since the previous separator are
 *                          recorded as a word of the current sentence;
 *   - '.' or '\n'       -> the current sentence is recorded in the paragraph;
 *   - '\n'              -> the current paragraph is recorded in the document.
 *
 * A word is recorded at EVERY separator, so two separators in a row (for
 * example ".\n" or a trailing '.') produce an empty word "" and, where
 * applicable, a sentence or paragraph containing just that word. An empty
 * `text` therefore yields one paragraph holding one sentence holding "".
 *
 * Words are freshly allocated NUL-terminated copies; `text` is not modified.
 * All arrays are heap allocated and are handed to the caller.
 *
 * A NULL `text` yields { NULL, 0 }. The program exits if an allocation fails.
 */
struct document get_document(char* text) {
    DOC doc = { NULL, 0 };   // you're the doc, doc
    P parr = { NULL, 0 };    // paragraph currently being built
    S sarr = { NULL, 0 };    // sentence currently being built

    if (text == NULL)
        return doc;

    size_t len = strlen(text);
    size_t wpos = 0;         // index where the pending word starts

    for (size_t i = 0; i <= len; i++) {   // <= length! (to deal with \0)
        char c = text[i];
        if (!c)
            c = '\n';        // End of string simulates end of paragraph

        if (c != '\n' && c != '.' && c != ' ')
            continue;        // still inside a word

        // End of word, add it to sentence
        size_t wlen = i - wpos;
        W word;
        word.data = doc_realloc(NULL, wlen + 1);   // +1 for '\0'
        memcpy(word.data, text + wpos, wlen);      // copy only the word
        word.data[wlen] = '\0';                    // terminate it
        sarr.data = doc_realloc(sarr.data,
                                sizeof *sarr.data * ((size_t)sarr.word_count + 1));
        sarr.data[sarr.word_count++] = word;
        wpos = i + 1;

        if (c == '\n' || c == '.') {
            // End of sentence, add it to paragraph
            parr.data = doc_realloc(parr.data,
                                    sizeof *parr.data * ((size_t)parr.sentence_count + 1));
            parr.data[parr.sentence_count++] = sarr;
            sarr.data = NULL;    // the paragraph owns the words now
            sarr.word_count = 0;
        }
        if (c == '\n') {
            // End of paragraph, add it to doc
            doc.data = doc_realloc(doc.data,
                                   sizeof *doc.data * ((size_t)doc.paragraph_count + 1));
            doc.data[doc.paragraph_count++] = parr;
            parr.data = NULL;    // the document owns the sentences now
            parr.sentence_count = 0;
        }
    }
    return doc;
}
Finally, to see if that's working, print all members (using a compliant text!)
/*
 * Parse a fixed sample text and dump every paragraph, sentence and word,
 * each with its zero-based index and the highest index at that level.
 */
int main(int argc, char **argv) {
    (void)argc;   /* command-line arguments are not used */
    (void)argv;

    char *text = "hello world\nhi.bye\nwow";
    DOC doc = get_document(text);
    int last_para = doc.paragraph_count - 1;

    for (int pi = 0; pi <= last_para; pi++) {
        const P *para = &doc.data[pi];
        int last_sent = para->sentence_count - 1;

        printf("Para %d / %d\n", pi, last_para);
        for (int si = 0; si <= last_sent; si++) {
            const S *sent = &para->data[si];
            int last_word = sent->word_count - 1;

            printf("Sent %d / %d\n", si, last_sent);
            for (int wi = 0; wi <= last_word; wi++)
                printf("Word %d / %d: %s\n", wi, last_word, sent->data[wi].data);
        }
    }
    return 0;
}
We could add a bit of code to avoid processing two consecutive separators (like a trailing '\n' or '.').
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.