[英]crawler for gwt application taking too much time
i have a gwt application that i need to optimize for seo ( crawl the content for google), and i have been trying many solutions wich are not meeting our needs (it's taking us a big amount of time to return the html page), the trials are: 我有一个gwt应用程序,我需要针对seo进行优化(抓取google的内容),并且我一直在尝试许多解决方案,这些解决方案无法满足我们的需求(这需要花费大量时间才能返回html页面),试用是:
Any suggestion? 有什么建议吗?
the code used to crawl: 用于抓取的代码:
public class CrawlFilter implements Filter {
private class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
private static final long serialVersionUID = 1L;
@Override
public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
return true;
}
}
private final Logger log = Logger.getLogger(CrawlFilter.class.getName());
/**
* Special URL token that gets passed from the crawler to the servlet
* filter. This token is used in case there are already existing query
* parameters.
*/
private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();
/**
* Special URL token that gets passed from the crawler to the servlet
* filter. This token is used in case there are not already existing query
* parameters.
*/
private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();
private static final long _pumpEventLoopTimeoutMillis = 30000;
private static final long _jsTimeoutMillis = 1000;
private static final long _pageWaitMillis = 200;
private static final int _maxLoopChecks = 2;
private WebClient webClient;
public void doFilter(ServletRequest request, ServletResponse response,
FilterChain filterChain) throws IOException, ServletException {
// Grab the request uri and query strings.
final HttpServletRequest httpRequest = (HttpServletRequest) request;
final String requestURI = httpRequest.getRequestURI();
final String queryString = httpRequest.getQueryString();
final HttpServletResponse httpResponse = (HttpServletResponse) response;
if ((queryString != null) && (queryString.contains(ESCAPED_FRAGMENT_FORMAT1))) {
final int port = httpRequest.getServerPort();
final String urlStringWithHashFragment = requestURI + rewriteQueryString(queryString);
final String scheme = httpRequest.getScheme();
final URL urlWithHashFragment = new URL(scheme, "127.0.0.1", port, urlStringWithHashFragment);
final WebRequest webRequest = new WebRequest(urlWithHashFragment);
log.fine("Crawl filter encountered escaped fragment, will open: " + webRequest.toString());
httpResponse.setContentType("text/html;charset=UTF-8");
final PrintWriter out = httpResponse.getWriter();
out.println(renderPage(webRequest));
out.flush();
out.close();
log.fine("HtmlUnit completed webClient.getPage(webRequest) where webRequest = " + webRequest.toString());
} else {
filterChain.doFilter(request, response);
}
}
@Override
public void destroy() {
if (webClient != null) {
webClient.closeAllWindows();
}
}
@Override
public void init(FilterConfig config) throws ServletException {
}
private StringBuilder renderPage(WebRequest webRequest) throws IOException {
webClient = new WebClient(BrowserVersion.FIREFOX_17);
webClient.getCache().clear();
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setRedirectEnabled(false);
webClient.setAjaxController(new SyncAllAjaxController());
webClient.setCssErrorHandler(new SilentCssErrorHandler());
final HtmlPage page = webClient.getPage(webRequest);
webClient.getJavaScriptEngine().pumpEventLoop(_pumpEventLoopTimeoutMillis);
int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
int loopCount = 0;
while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks) {
++loopCount;
waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
if (waitForBackgroundJavaScript == 0) {
log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
break;
}
synchronized (page) {
log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
try {
page.wait(_pageWaitMillis);
} catch (InterruptedException e) {
log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
}
}
}
webClient.getAjaxController().processSynchron(page, webRequest, false);
if (webClient.getJavaScriptEngine().isScriptRunning()) {
log.warning("HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
webClient.getJavaScriptEngine().shutdownJavaScriptExecutor();
}
final String staticSnapshotHtml = page.asXml();
StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("<hr />\n");
stringBuilder.append("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
stringBuilder.append(webRequest.getUrl() + "\">this link</a> for the interactive application.<br></h3></center>");
stringBuilder.append("<hr />");
stringBuilder.append(staticSnapshotHtml);
return stringBuilder;
}
/**
* Maps from the query string that contains _escaped_fragment_ to one that
* doesn't, but is instead followed by a hash fragment. It also unescapes any
* characters that were escaped by the crawler. If the query string does not
* contain _escaped_fragment_, it is not modified.
*
* @param queryString
* @return A modified query string followed by a hash fragment if applicable.
* The non-modified query string otherwise.
* @throws UnsupportedEncodingException
*/
private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
int length = ESCAPED_FRAGMENT_LENGTH2;
if (index == -1) {
index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
length = ESCAPED_FRAGMENT_LENGTH1;
}
if (index != -1) {
StringBuilder queryStringSb = new StringBuilder();
if (index > 0) {
queryStringSb.append("?");
queryStringSb.append(queryString.substring(0, index));
}
queryStringSb.append("#!");
queryStringSb.append(URLDecoder.decode(queryString.substring(index
+ length, queryString.length()), "UTF-8"));
return queryStringSb.toString();
}
return queryString;
}
}
I suggest having HtmlUnit generate the static html offline. 我建议让HtmlUnit离线生成静态html。 You control the update frequency.
您可以控制更新频率。
Then, have your servlet filter intercepting the crawler request return the already generated static html. 然后,让您的Servlet过滤器拦截搜寻器请求,返回已经生成的静态html。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.