简体   繁体   中英

How to extract the main content from a webpage?

I am trying to write a summary of the content of a web page. For that, I need to strip all the irrelevant text and data from the page so that only the main content remains.

I have used boilerpipe, but the text extraction is not good. The results are here, where you can see a lot of irrelevant text.

I also tried jsoup to strip away irrelevant data by removing headers, footers, external links, etc., but again the results are not up to the mark.

    // Fetch and parse the page. Jsoup.connect() needs an absolute URL,
    // including the http:// or https:// scheme.
    Document doc = Jsoup.connect("http://www.anyurl.com").get();

    // Drop structural elements that normally carry no article content.
    doc.head().remove();
    doc.getElementsByTag("header").remove();
    doc.getElementsByTag("footer").remove();
    doc.getElementsByTag("form").remove();
    doc.getElementsByTag("table").remove();
    doc.getElementsByTag("meta").remove();
    doc.getElementsByTag("img").remove();
    doc.getElementsByTag("a").remove();
    doc.getElementsByTag("br").remove();

    // Drop elements whose class is exactly one of these well-known names.
    doc.getElementsByClass("tags").remove();
    doc.getElementsByClass("copyright").remove();
    doc.getElementsByClass("widget").remove();

    // Drop elements whose class attribute merely contains one of these
    // fragments. The attribute selector must be closed with ']'.
    doc.select("div[class*=foot]").remove();
    doc.select("div[class*=tag]").remove();
    doc.select("div[class*=Loading]").remove();
    doc.select("div[class*=Widget]").remove();
    doc.select("div[class*=Head]").remove();
    doc.select("div[class*=menu]").remove();
    doc.select("p[class*=link]").remove();

    Elements paragraphs = doc.select("p");
    Elements divs = doc.select("div");

    // NOTE(review): any <p> nested inside a <div> contributes its text to both
    // operands, so the output can contain the same text more than once.
    formattedOutput = paragraphs.text() + divs.text();

Can anyone suggest how to get this done? Is there any Java library other than boilerpipe that does this for you?

I don't know about Java, but you can use the following PHP class to extract the main content from a webpage:

<?php

/**
 * Heuristic main-content extractor for HTML pages.
 *
 * The document is processed in two passes:
 *  1. HeuristicRemove() strips tags that never carry content (scripts, forms,
 *     images, ...) and optionally gathers link/text statistics.
 *  2. ContainerRemove() deletes container elements whose link-to-word ratio is
 *     too high or whose text is too short.
 */
class ContentExtractor {

    /** @var string[] Node names that ContainerRemove() evaluates as containers. */
    public $container_tags = array(
            'div', 'table', 'td', 'th', 'tr', 'tbody', 'thead', 'tfoot', 'col', 
            'colgroup', 'ul', 'ol', 'html', 'center', 'span'
        );
    /** @var string[] Node names that HeuristicRemove() always deletes. */
    public $removed_tags = array(
            'script', 'noscript', 'style', 'form', 'meta', 'input', 'iframe', 'embed', 'hr', 'img',
            '#comment', 'link', 'label'
        );
    /** @var string[] Container node names exempt from the minimum length/word checks. */
    public $ignore_len_tags = array(
            'span'
        );  

    /** @var float Containers with more links per word than this are deleted. */
    public $link_text_ratio = 0.04;
    /** @var int|float Containers with fewer bytes of text than this are deleted. */
    public $min_text_len = 20;
    /** @var int Containers with fewer words than this are deleted. */
    public $min_words = 0; 

    /** @var int Number of <a> elements seen while gathering statistics. */
    public $total_links = 0;
    /** @var int Number of words found outside of links. */
    public $total_unlinked_words = 0;
    /** @var string Concatenated text found outside of links. */
    public $total_unlinked_text='';
    /** @var int Number of non-link nodes that directly contain text. */
    public $text_blocks = 0;

    /** @var DOMDocument|null Document currently being processed. */
    public $tree = null;
    /** @var array Not used by this class; kept for backward compatibility. */
    public $unremoved=array();

    /**
     * Decodes HTML entities and normalises Unicode space characters.
     *
     * @param string $text Raw text, expected to be UTF-8.
     * @return string Trimmed text in which the listed spaces are plain ' '.
     */
    public function sanitize_text($text){
        $text = str_ireplace('&nbsp;', ' ', $text);
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // Only complete UTF-8 sequences are listed. A lone "\xA0" byte must not
        // be replaced: it is also the continuation byte of characters such as
        // "à" (C3 A0), so replacing it would corrupt them.
        $utf_spaces = array("\xC2\xA0", "\xE1\x9A\x80", "\xE2\x80\x83", 
            "\xE2\x80\x82", "\xE2\x80\x84", "\xE2\x80\xAF");
        $text = str_replace($utf_spaces, ' ', $text);

        return trim($text);
    }

    /**
     * Extracts the main content of an HTML document.
     *
     * @param string         $text    HTML source.
     * @param float|int|null $ratio   Maximum links-per-word ratio for a container;
     *                                null derives it from the document.
     * @param float|int|null $min_len Minimum text length for a container;
     *                                null uses the document's average text-block length.
     * @return string|false Cleaned HTML, or false when the input is empty or
     *                      cannot be parsed into a document.
     */
    public function extract($text, $ratio = null, $min_len = null){
        // Start from clean statistics so that an instance can be reused.
        $this->total_links = 0;
        $this->total_unlinked_words = 0;
        $this->total_unlinked_text = '';
        $this->text_blocks = 0;

        // loadHTML() rejects an empty string (ValueError on PHP 8).
        $text = (string) $text;
        if ($text === '') return false;

        $this->tree = new DOMDocument();

        // Collect parser diagnostics internally instead of silencing with '@'.
        $previous = libxml_use_internal_errors(true);
        $loaded = $this->tree->loadHTML($text);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if (!$loaded) return false;

        $root = $this->tree->documentElement;
        if ($root === null) return false;

        // Strict null checks: an explicit 0 is a valid threshold, not "auto".
        $auto_ratio = ($ratio === null);
        $auto_len = ($min_len === null);

        $this->HeuristicRemove($root, $auto_ratio || $auto_len);

        if ($auto_ratio) {
            $this->total_unlinked_text = $this->sanitize_text($this->total_unlinked_text);
            $this->total_unlinked_words = $this->count_words(
                '/[\s\r\n\t\|?!.,]+/',
                $this->total_unlinked_text
            );
            if ($this->total_unlinked_words>0) {
                $this->link_text_ratio = $this->total_links / $this->total_unlinked_words;
                $this->link_text_ratio *= 1.3;
            }
        } else {
            $this->link_text_ratio = $ratio;
        }

        if ($auto_len) {
            // Without any text block there is no average; 0 disables the length
            // check instead of dividing by zero.
            $this->min_text_len = ($this->text_blocks > 0)
                ? strlen($this->total_unlinked_text) / $this->text_blocks
                : 0;
        } else {
            $this->min_text_len = $min_len;
        }

        $this->ContainerRemove($root);

        return $this->tree->saveHTML();
    }

    /**
     * Recursively deletes nodes listed in $removed_tags and, when requested,
     * gathers link and unlinked-text statistics.
     *
     * @param DOMNode $node     Node to process.
     * @param bool    $do_stats Whether to update the total_* / text_blocks counters.
     * @return bool True when $node itself must be removed by its parent.
     */
    public function HeuristicRemove($node, $do_stats = false){
        if (in_array($node->nodeName, $this->removed_tags, true)){
            return true;
        }

        if ($do_stats && $node->nodeName == 'a') {
            $this->total_links++;
        }
        $found_text = false;

        if ($node->hasChildNodes()){
            // Removal is deferred so the live child list is not modified while
            // it is being iterated.
            $nodes_to_remove = array();
            foreach($node->childNodes as $child){
                if ($this->HeuristicRemove($child, $do_stats)) {
                    $nodes_to_remove[] = $child;
                } else if ( $do_stats && ($node->nodeName != 'a') && ($child->nodeName == '#text') ) {
                    $this->total_unlinked_text .= $child->wholeText;
                    // Each node counts as one text block, however many text
                    // children it has.
                    if (!$found_text){
                        $this->text_blocks++;
                        $found_text = true;
                    }
                }
            }
            foreach ($nodes_to_remove as $child){
                $node->removeChild($child);
            }
        }

        return false;
    }

    /**
     * Recursively deletes container elements that look like navigation or
     * boilerplate: too many links per word, too little text, or too few words.
     *
     * @param DOMNode|null $node Node to process.
     * @return array [0 => link count, 1 => word count, 2 => text length,
     *               'delete' => bool whether the parent should remove $node].
     */
    public function ContainerRemove($node){
        // Same shape as the regular return value, so callers can index it.
        if (is_null($node)) return array(0, 0, 0, 'delete' => false);

        $link_cnt = 0;
        $word_cnt = 0;
        $text_len = 0;
        $delete = false;
        $my_text = '';

        // A container without any words is treated as consisting of links only.
        $ratio = 1;

        if ($node->hasChildNodes()){
            $nodes_to_remove = array();
            foreach($node->childNodes as $child){
                $data = $this->ContainerRemove($child);

                if ($data['delete']) {
                    $nodes_to_remove[] = $child;
                } else {
                    // Only text that survives counts towards this node's length.
                    $text_len += $data[2];
                }

                $link_cnt += $data[0];

                if ($child->nodeName == 'a') {
                    // Words inside a link are not counted as content words.
                    $link_cnt++;
                } else {
                    if ($child->nodeName == '#text') $my_text .= $child->wholeText;
                    $word_cnt += $data[1];
                }
            }

            foreach ($nodes_to_remove as $child){
                $node->removeChild($child);
            }

            $my_text = $this->sanitize_text($my_text);

            $word_cnt += $this->count_words('/[\s\r\n\t\|?!.,\[\]]+/', $my_text);
            $text_len += strlen($my_text);
        }

        if (in_array($node->nodeName, $this->container_tags, true)){
            if ($word_cnt>0) $ratio = $link_cnt/$word_cnt;

            if ($ratio > $this->link_text_ratio){
                $delete = true;
            }

            if ( !in_array($node->nodeName, $this->ignore_len_tags, true) ) {
                if ( ($text_len < $this->min_text_len) || ($word_cnt<$this->min_words) ) {
                    $delete = true;
                }
            }
        }

        return array($link_cnt, $word_cnt, $text_len, 'delete' => $delete);
    }

    /**
     * Counts the non-empty tokens obtained by splitting $text on $pattern.
     * Unlike array_filter(), this keeps tokens such as "0".
     *
     * @param string $pattern PCRE delimiter pattern.
     * @param string $text    Text to split.
     * @return int Number of words.
     */
    private function count_words($pattern, $text){
        $words = preg_split($pattern, $text, -1, PREG_SPLIT_NO_EMPTY);
        return ($words === false) ? 0 : count($words);
    }

}

/****************************
    Simple usage example
*****************************/

// Download the page; file_get_contents() returns false on failure.
$html = file_get_contents('http://en.wikipedia.org/wiki/Shannon_index');
if ($html === false) {
    exit('Could not download the page.');
}

$extractor = new ContentExtractor();

// extract() returns false when the HTML cannot be parsed.
$content = $extractor->extract($html);
if ($content === false) {
    exit('Could not parse the page.');
}
echo $content;

?>

The technical posts on this site are published under the CC BY-SA 4.0 license. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM