Parsing simple HTML into tree

Question

I'd like to ask what is the best way to parse a simple html code into DOM Node tree like this one:

示例树

Here are some constraints I am facing:

HTML code will have only pair tags, no attributes and I've to ignore spaces
There can be Text between tags like <p> , <h1> , <a> etc.
I can't use libraries

I was thinking about regex, but never tried it so.. Any ideas?

Every node in the tree is this struct:

  typedef struct tag
  {
      struct tag* parent;
      struct tag* nextSibling;
      struct tag* previousSibling;
      struct tag* firstChild;
      struct tag* lastChild;     
      char* name;
      char* text;     
  }node;

Answer 1

I know it isin't in C, but that presentation might give you some input on how you could efficiently tackle the problem.

https://web.archive.org/web/20120115060003/http://cuddle.googlecode.com/hg/talk/lex.html#landing-slide

I also wrote a very simple parser example in JavaScript (again not in C, but hopefully you know JS as well) based on your initial requirements, meaning that it will not parse any attributes and do not handle self-closing tags and many other things that should be handled according to the HTML specs. It will produce a parse tree in this format:

{
    cn: [{
        tag: 'html',
        cn: [{
            tag: 'body',
            cn: [
                { tag: 'h1', cn: ['test'] },
                ' some text ',
                ...
            ]
        }] 
    }]
}

Here's the code and fiddle: http://jsfiddle.net/LUpyZ/3/

Note that white space is not ignored and will be captured in text nodes.

var html = '<html><body><h1>test</h1> some text <div> <p>text</p></div></body></html>';

var parseHTML = (function () {
    var nodesStack = [],
        i = 0,
        len = html.length,
        stateFn = parseText,
        parseTree = { cn: [] },
        alphaNumRx = /\w/,
        currentNode = parseTree,
        text = '',
        tag = '',
        newNode;

    function parseTag(token) {
        if (token === '/') {
            return parseCloseTag;
        }

        i--; //backtrack to first tag character
        return parseOpenTag;
    }

    function parseCloseTag(token) {
        if (token === '>') {
            if (currentNode.tag !== tag) {
                throw 'Wrong closed tag at char ' + i;
            }

            tag = '';

            nodesStack.pop();

            currentNode = currentNode.parentNode;

            return parseText;            
        }

        assertValidTagNameChar(token);

        tag += token;

        return parseCloseTag;
    }

    function parseOpenTag(token) {
        if (token === '>') {
            currentNode.cn.push(newNode = { tag: tag, parentNode: currentNode,  cn: []});
            nodesStack.push(currentNode = newNode);

            tag = '';

            return parseText;
        }

        assertValidTagNameChar(token);

        tag += token;

        return parseOpenTag;
    }

    function parseText(token) {
        if (token === '<') {

            if (text) {
                currentNode.cn.push(text);
                text = '';
            }

            return parseTag;
        }

        text += token;

        return parseText;
    }

    function assertValidTagNameChar(c) {
        if (!alphaNumRx.test(c)) {
            throw 'Invalid tag name char at ' + i;
        }
    }

    return function (html) {
        for (; i < len; i++) {
            stateFn = stateFn(html[i]);
        }

        if (currentNode = nodesStack.pop()) {
            throw 'Unbalanced tags: ' + currentNode.tag + ' is never closed.';
        }

        return parseTree;
    };
})();

console.log(parseHTML(html));

Parsing simple HTML into tree

Question

1 answers

solution1
2 2013-03-28 07:34:47

Parsing simple HTML into tree

Question

1 answers

solution1 2 2013-03-28 07:34:47

solution1
2 2013-03-28 07:34:47