
Crawler4j With Grails App

I am building a crawler application in Grails (Groovy). I am using Crawler4j and following this tutorial.

  1. I created a new Grails project.
  2. Put the BasicCrawlController.groovy file under grails-app/controllers (inside a package).
  3. Did not create any view, because I expected that after run-app the crawled data would appear in my crawlStorageFolder (please correct me if my understanding is flawed).

After that I simply ran the application with run-app, but I didn't see any crawled data anywhere.

  1. Am I right in expecting some file to be created at the crawlStorageFolder location that I have given as C:/crawl/crawler4jStorage?
  2. Do I need to create any view for this?
  3. If I want to invoke this crawler controller from some other view, on click of a submit button in a form, can I just write <g:form name="submitWebsite" url="[controller:'BasicCrawlController']">?

I ask because I do not have any action method in this controller, so is that the right way to invoke it?

My code is as follows:

//All necessary imports  



    public class BasicCrawlController {
        static main(args) throws Exception {
            String crawlStorageFolder = "C:/crawl/crawler4jStorage";
            int numberOfCrawlers = 1;
            //int maxDepthOfCrawling = -1;    default
            CrawlConfig config = new CrawlConfig();
            config.setCrawlStorageFolder(crawlStorageFolder);
            config.setPolitenessDelay(1000);
            config.setMaxPagesToFetch(100);
            config.setResumableCrawling(false);
            PageFetcher pageFetcher = new PageFetcher(config);
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
            controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
            controller.start(BasicCrawler.class, 1);

        }
    }


    class BasicCrawler extends WebCrawler {

        final static Pattern FILTERS = Pattern
            .compile(".*(\\.(css|js|bmp|gif|jpe?g" + "|png|tiff?|mid|mp2|mp3|mp4" +
                     "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

        /**
         * You should implement this function to specify whether the given url
         * should be crawled or not (based on your crawling logic).
         */
        @Override
        boolean shouldVisit(WebURL url) {
            String href = url.getURL().toLowerCase()
            !FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
        }

        /**
         * This function is called when a page is fetched and ready to be processed
         * by your program.
         */
        @Override
        void visit(Page page) {
            int docid = page.getWebURL().getDocid()
            String url = page.getWebURL().getURL()
            String domain = page.getWebURL().getDomain()
            String path = page.getWebURL().getPath()
            String subDomain = page.getWebURL().getSubDomain()
            String parentUrl = page.getWebURL().getParentUrl()
            String anchor = page.getWebURL().getAnchor()

            println("Docid: ${docid} ")
            println("URL: ${url}  ")
            println("Domain: '${domain}'")
            println("Sub-domain: ' ${subDomain}'")
            println("Path: '${path}'")
            println("Parent page:${parentUrl}  ")
            println("Anchor text: ${anchor} ")

            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
                String text = htmlParseData.getText()
                String html = htmlParseData.getHtml()
                List<WebURL> links = htmlParseData.getOutgoingUrls()

                println("Text length: " + text.length())
                println("Html length: " + html.length())
                println("Number of outgoing links: " + links.size())
            }
            Header[] responseHeaders = page.getFetchResponseHeaders()
            if (responseHeaders != null) {
                println("Response headers:")
                for (Header header : responseHeaders) {
                    println("\t ${header.getName()} : ${header.getValue()}")
                }
            }
            println("=============")
        }
    }

I'll try to translate your code into the standard Grails structure.
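Before that, make sure the crawler4j jar and its dependencies are on the classpath. Assuming Grails 2.x and the crawler4j artifact from Maven Central (the version below is only an example, not something taken from your post), one way is to declare it in grails-app/conf/BuildConfig.groovy:

    // inside grails.project.dependency.resolution in grails-app/conf/BuildConfig.groovy
    repositories {
        mavenCentral()
    }
    dependencies {
        compile 'edu.uci.ics:crawler4j:3.5'   // crawler4j and its transitive dependencies
    }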

Put this under grails-app/controllers:

    import edu.uci.ics.crawler4j.crawler.CrawlConfig
    import edu.uci.ics.crawler4j.crawler.CrawlController
    import edu.uci.ics.crawler4j.fetcher.PageFetcher
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer

    class BasicCrawlController {

        def index() {
            String crawlStorageFolder = "C:/crawl/crawler4jStorage"
            int numberOfCrawlers = 1
            // int maxDepthOfCrawling = -1 (default)

            CrawlConfig crawlConfig = new CrawlConfig()
            crawlConfig.setCrawlStorageFolder(crawlStorageFolder)
            crawlConfig.setPolitenessDelay(1000)
            crawlConfig.setMaxPagesToFetch(100)
            crawlConfig.setResumableCrawling(false)

            PageFetcher pageFetcher = new PageFetcher(crawlConfig)
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig()
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher)

            CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer)
            controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")

            // start() blocks until the crawl has finished
            controller.start(BasicCrawler.class, numberOfCrawlers)

            render "done crawling"
        }
    }
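Keep in mind that controller.start(...) is a blocking call: the HTTP request will not return until the crawl has finished (up to 100 pages with the configuration above), and only then is "done crawling" rendered. crawler4j also provides controller.startNonBlocking(...) if you would rather kick the crawl off and return immediately, and for anything long-running a Grails service or background job is usually a better home than a controller action.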

And put this under src/groovy:

    import java.util.regex.Pattern

    import org.apache.http.Header

    import edu.uci.ics.crawler4j.crawler.Page
    import edu.uci.ics.crawler4j.crawler.WebCrawler
    import edu.uci.ics.crawler4j.parser.HtmlParseData
    import edu.uci.ics.crawler4j.url.WebURL

    class BasicCrawler extends WebCrawler {

        final static Pattern FILTERS = Pattern.compile(
            ".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4" +
            "|wav|avi|mov|mpeg|ram|m4v|pdf|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

        /**
         * You should implement this function to specify whether the given url
         * should be crawled or not (based on your crawling logic).
         */
        @Override
        boolean shouldVisit(WebURL url) {
            String href = url.getURL().toLowerCase()
            !FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
        }

        /**
         * This function is called when a page is fetched and ready to be processed
         * by your program.
         */
        @Override
        void visit(Page page) {
            int docid = page.getWebURL().getDocid()
            String url = page.getWebURL().getURL()
            String domain = page.getWebURL().getDomain()
            String path = page.getWebURL().getPath()
            String subDomain = page.getWebURL().getSubDomain()
            String parentUrl = page.getWebURL().getParentUrl()
            String anchor = page.getWebURL().getAnchor()

            println("Docid: ${docid}")
            println("URL: ${url}")
            println("Domain: '${domain}'")
            println("Sub-domain: '${subDomain}'")
            println("Path: '${path}'")
            println("Parent page: ${parentUrl}")
            println("Anchor text: ${anchor}")

            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
                String text = htmlParseData.getText()
                String html = htmlParseData.getHtml()
                List<WebURL> links = htmlParseData.getOutgoingUrls()

                println("Text length: " + text.length())
                println("Html length: " + html.length())
                println("Number of outgoing links: " + links.size())
            }

            Header[] responseHeaders = page.getFetchResponseHeaders()
            if (responseHeaders != null) {
                println("Response headers:")
                for (Header header : responseHeaders) {
                    println("\t ${header.getName()} : ${header.getValue()}")
                }
            }
            println("=============")
        }
    }
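A couple of notes on your original questions. Files will appear under C:/crawl/crawler4jStorage, but they are crawler4j's internal working data (its frontier database), not the crawled pages themselves; whatever you want to keep has to be written out by your visit() method, and since the code above only calls println, the output shows up in the console of the running Grails app, not in a view. You don't need a view at all for that, but if you want to trigger the crawl from a page, point a form at the controller action. A minimal GSP sketch, assuming the index action above and the default URL mappings:

    <g:form controller="basicCrawl" action="index" name="submitWebsite">
        <g:submitButton name="crawl" value="Start crawl" />
    </g:form>

Note that the form uses the logical controller name (basicCrawl), not the class name; the equivalent url map syntax would be url="[controller: 'basicCrawl', action: 'index']".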
