I am making a crawler application in Groovy on Grails. I am using Crawler4j and following this tutorial .
After that I just ran the application by doing run-app
but I didn't see any crawling data anywhere.
<g:form name="submitWebsite" url="[controller:'BasicCrawlController ']">
? I asked this because I do not have any method in this controller, so is it the right way to invoke this controller?
My code is as follows:
//All necessary imports
public class BasicCrawlController {
static main(args) throws Exception {
String crawlStorageFolder = "C:/crawl/crawler4jStorage";
int numberOfCrawlers = 1;
//int maxDepthOfCrawling = -1; default
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
config.setPolitenessDelay(1000);
config.setMaxPagesToFetch(100);
config.setResumableCrawling(false);
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
controller.start(BasicCrawler.class, 1);
}
}
class BasicCrawler extends WebCrawler {
final static Pattern FILTERS = Pattern
.compile(".*(\\.(css|js|bmp|gif|jpe?g"+ "|png|tiff?|mid|mp2|mp3|mp4" +
"|wav|avi|mov|mpeg|ram|m4v|pdf" +"|rm|smil|wmv|swf|wma|zip|rar|gz))\$")
/**
* You should implement this function to specify whether the given url
* should be crawled or not (based on your crawling logic).
*/
@Override
boolean shouldVisit(WebURL url) {
String href = url.getURL().toLowerCase()
!FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
}
/**
* This function is called when a page is fetched and ready to be processed
* by your program.
*/
@Override
void visit(Page page) {
int docid = page.getWebURL().getDocid()
String url = page.getWebURL().getURL()
String domain = page.getWebURL().getDomain()
String path = page.getWebURL().getPath()
String subDomain = page.getWebURL().getSubDomain()
String parentUrl = page.getWebURL().getParentUrl()
String anchor = page.getWebURL().getAnchor()
println("Docid: ${docid} ")
println("URL: ${url} ")
println("Domain: '${domain}'")
println("Sub-domain: ' ${subDomain}'")
println("Path: '${path}'")
println("Parent page:${parentUrl} ")
println("Anchor text: ${anchor} " )
if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
String text = htmlParseData.getText()
String html = htmlParseData.getHtml()
List<WebURL> links = htmlParseData.getOutgoingUrls()
println("Text length: " + text.length())
println("Html length: " + html.length())
println("Number of outgoing links: " + links.size())
}
Header[] responseHeaders = page.getFetchResponseHeaders()
if (responseHeaders != null) {
println("Response headers:")
for (Header header : responseHeaders) {
println("\t ${header.getName()} : ${header.getValue()}")
}
}
println("=============")
}
}
I'll try to translate your code into a Grails standard.
Use this under grails-app/controller
class BasicCrawlController {
def index() {
String crawlStorageFolder = "C:/crawl/crawler4jStorage";
int numberOfCrawlers = 1;
//int maxDepthOfCrawling = -1; default
CrawlConfig crawlConfig = new CrawlConfig();
crawlConfig.setCrawlStorageFolder(crawlStorageFolder);
crawlConfig.setPolitenessDelay(1000);
crawlConfig.setMaxPagesToFetch(100);
crawlConfig.setResumableCrawling(false);
PageFetcher pageFetcher = new PageFetcher(crawlConfig);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);
controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
controller.start(BasicCrawler.class, 1);
render "done crawling"
}
}
Use this under src/groovy
class BasicCrawler extends WebCrawler {
final static Pattern FILTERS = Pattern
.compile(".*(\\.(css|js|bmp|gif|jpe?g"+ "|png|tiff?|mid|mp2|mp3|mp4" +
"|wav|avi|mov|mpeg|ram|m4v|pdf" +"|rm|smil|wmv|swf|wma|zip|rar|gz))\$")
/**
* You should implement this function to specify whether the given url
* should be crawled or not (based on your crawling logic).
*/
@Override
boolean shouldVisit(WebURL url) {
String href = url.getURL().toLowerCase()
!FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
}
/**
* This function is called when a page is fetched and ready to be processed
* by your program.
*/
@Override
void visit(Page page) {
int docid = page.getWebURL().getDocid()
String url = page.getWebURL().getURL()
String domain = page.getWebURL().getDomain()
String path = page.getWebURL().getPath()
String subDomain = page.getWebURL().getSubDomain()
String parentUrl = page.getWebURL().getParentUrl()
String anchor = page.getWebURL().getAnchor()
println("Docid: ${docid} ")
println("URL: ${url} ")
println("Domain: '${domain}'")
println("Sub-domain: ' ${subDomain}'")
println("Path: '${path}'")
println("Parent page:${parentUrl} ")
println("Anchor text: ${anchor} " )
if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
String text = htmlParseData.getText()
String html = htmlParseData.getHtml()
List<WebURL> links = htmlParseData.getOutgoingUrls()
println("Text length: " + text.length())
println("Html length: " + html.length())
println("Number of outgoing links: " + links.size())
}
Header[] responseHeaders = page.getFetchResponseHeaders()
if (responseHeaders != null) {
println("Response headers:")
for (Header header : responseHeaders) {
println("\t ${header.getName()} : ${header.getValue()}")
}
}
println("=============")
}
}
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.