
Crawler for GWT application taking too much time

I have a GWT application that I need to optimize for SEO (so Google can crawl the content), and I have tried several solutions, none of which meet our needs: each takes far too long to return the HTML page. The trials were:

  1. Using HtmlUnit as a headless browser to crawl the page on demand. It takes about 15 seconds to get the HTML content; auditing that timing shows that roughly 80% of it is spent in the loop that waits for background JavaScript: "while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks)" (a bounded-wait alternative is sketched just after this list).
  2. Crawling the page ahead of Google's request and serving the saved snapshot when Google asks for it. This is not convenient for us because the content changes very frequently, and Google may consider the stale snapshot to be cloaking.
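
For what it's worth, HtmlUnit also exposes waitForBackgroundJavaScriptStartingBefore, which only waits for jobs scheduled to start before the given deadline instead of polling until the job count reaches zero. A minimal standalone sketch of that call (the URL, port, and timeout are placeholders, not values from our application):

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class SnapshotProbe {
    public static void main(String[] args) throws Exception {
        WebClient client = new WebClient(BrowserVersion.FIREFOX_17);
        client.getOptions().setCssEnabled(false);
        client.getOptions().setThrowExceptionOnScriptError(false);
        client.setAjaxController(new NicelyResynchronizingAjaxController());

        // Placeholder URL; substitute the real #! page to snapshot.
        HtmlPage page = client.getPage("http://127.0.0.1:8888/#!home");

        // Waits only for background jobs scheduled to start within the next
        // 2 seconds, rather than looping until the job count reaches zero.
        int pending = client.waitForBackgroundJavaScriptStartingBefore(2000);
        System.out.println("Background jobs still pending: " + pending);
        System.out.println(page.asXml());
        client.closeAllWindows();
    }
}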

Any suggestions?

The code used to crawl:

import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.SilentCssErrorHandler;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

// Registration is not shown here; with a Servlet 3.0+ container the filter
// could be mapped via web.xml or an annotation such as @WebFilter("/*").
public class CrawlFilter implements Filter {
    /** Forces every AJAX call to run synchronously so the snapshot is complete. */
    private class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
            return true;
        }
    }

    private final Logger log = Logger.getLogger(CrawlFilter.class.getName());

    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This form (no leading "&") is used when _escaped_fragment_ is
     * the only query parameter.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT1 = "_escaped_fragment_=";
    private static final int ESCAPED_FRAGMENT_LENGTH1 = ESCAPED_FRAGMENT_FORMAT1.length();
    /**
     * Special URL token that gets passed from the crawler to the servlet
     * filter. This form (with leading "&") is used when other query
     * parameters precede _escaped_fragment_.
     */
    private static final String ESCAPED_FRAGMENT_FORMAT2 = "&" + ESCAPED_FRAGMENT_FORMAT1;
    private static final int ESCAPED_FRAGMENT_LENGTH2 = ESCAPED_FRAGMENT_FORMAT2.length();

    private static final long _pumpEventLoopTimeoutMillis = 30000;
    private static final long _jsTimeoutMillis = 1000;
    private static final long _pageWaitMillis = 200;
    private static final int _maxLoopChecks = 2;

    // Note: renderPage() creates a new WebClient per request and overwrites
    // this field, so only the last client is closed in destroy(); this is
    // neither thread-safe nor leak-free.
    private WebClient webClient;

    @Override
    public void doFilter(ServletRequest request, ServletResponse response,
                         FilterChain filterChain) throws IOException, ServletException {
        // Grab the request URI and query string.
        final HttpServletRequest httpRequest = (HttpServletRequest) request;
        final String requestURI = httpRequest.getRequestURI();
        final String queryString = httpRequest.getQueryString();
        final HttpServletResponse httpResponse = (HttpServletResponse) response;

        if ((queryString != null) && (queryString.contains(ESCAPED_FRAGMENT_FORMAT1))) {
            final int port = httpRequest.getServerPort();
            final String urlStringWithHashFragment = requestURI + rewriteQueryString(queryString);
            final String scheme = httpRequest.getScheme();
            final URL urlWithHashFragment = new URL(scheme, "127.0.0.1", port, urlStringWithHashFragment);
            final WebRequest webRequest = new WebRequest(urlWithHashFragment);

            log.fine("Crawl filter encountered escaped fragment, will open: " + webRequest.toString());

            httpResponse.setContentType("text/html;charset=UTF-8");
            final PrintWriter out = httpResponse.getWriter();
            out.println(renderPage(webRequest));
            out.flush();
            out.close();

            log.fine("HtmlUnit completed webClient.getPage(webRequest) where webRequest = " + webRequest.toString());
        } else {
            filterChain.doFilter(request, response);
        }
    }

    @Override
    public void destroy() {
        if (webClient != null) {
            webClient.closeAllWindows();
        }
    }

    @Override
    public void init(FilterConfig config) throws ServletException {
    }

    private StringBuilder renderPage(WebRequest webRequest) throws IOException {
        webClient = new WebClient(BrowserVersion.FIREFOX_17);
        webClient.getCache().clear();
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setRedirectEnabled(false);
        webClient.setAjaxController(new SyncAllAjaxController());
        webClient.setCssErrorHandler(new SilentCssErrorHandler());

        final HtmlPage page = webClient.getPage(webRequest);
        webClient.getJavaScriptEngine().pumpEventLoop(_pumpEventLoopTimeoutMillis);

        int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);
        int loopCount = 0;

        // This loop is where ~80% of the rendering time is spent: each pass
        // blocks for up to _jsTimeoutMillis waiting on background JavaScript.
        while (waitForBackgroundJavaScript > 0 && loopCount < _maxLoopChecks) {
            ++loopCount;
            waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(_jsTimeoutMillis);

            if (waitForBackgroundJavaScript == 0) {
                log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
                break;
            }

            synchronized (page) {
                log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
                try {
                    page.wait(_pageWaitMillis);
                } catch (InterruptedException e) {
                    log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
                }
            }
        }

        webClient.getAjaxController().processSynchron(page, webRequest, false);
        if (webClient.getJavaScriptEngine().isScriptRunning()) {
            log.warning("HtmlUnit webClient.getJavaScriptEngine().shutdownJavaScriptExecutor()");
            webClient.getJavaScriptEngine().shutdownJavaScriptExecutor();
        }

        final String staticSnapshotHtml = page.asXml();
        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("<hr />\n");
        stringBuilder.append("<center><h3>This is a non-interactive snapshot for crawlers. Follow <a href=\"");
        stringBuilder.append(webRequest.getUrl()).append("\">this link</a> for the interactive application.<br></h3></center>");
        stringBuilder.append("<hr />");
        stringBuilder.append(staticSnapshotHtml);

        return stringBuilder;
    }

    /**
     * Maps from the query string that contains _escaped_fragment_ to one that
     * doesn't, but is instead followed by a hash fragment. It also unescapes any
     * characters that were escaped by the crawler. If the query string does not
     * contain _escaped_fragment_, it is not modified. For example,
     * "key1=value1&_escaped_fragment_=home" is rewritten to "?key1=value1#!home".
     *
     * @param queryString the raw query string from the request
     * @return A modified query string followed by a hash fragment if applicable.
     *         The non-modified query string otherwise.
     * @throws UnsupportedEncodingException if UTF-8 decoding fails
     */
    private static String rewriteQueryString(String queryString) throws UnsupportedEncodingException {
        // Check the "&"-prefixed form first, since it is the more specific match.
        int index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT2);
        int length = ESCAPED_FRAGMENT_LENGTH2;

        if (index == -1) {
            index = queryString.indexOf(ESCAPED_FRAGMENT_FORMAT1);
            length = ESCAPED_FRAGMENT_LENGTH1;
        }

        if (index != -1) {
            StringBuilder queryStringSb = new StringBuilder();
            if (index > 0) {
                queryStringSb.append("?");
                queryStringSb.append(queryString.substring(0, index));
            }
            queryStringSb.append("#!");
            queryStringSb.append(URLDecoder.decode(queryString.substring(index + length), "UTF-8"));
            return queryStringSb.toString();
        }

        return queryString;
    }
}

I suggest having HtmlUnit generate the static HTML offline; that way you control the update frequency.

Then have the servlet filter that intercepts the crawler request return the already-generated static HTML.
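
A minimal sketch of that approach, assuming an offline job (essentially renderPage() above run on a schedule) keeps a snapshot directory fresh; the /var/snapshots path and the file-naming scheme are made up for illustration:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 * Serves pre-generated snapshots to crawlers instead of rendering on demand.
 * An offline HtmlUnit job is assumed to keep /var/snapshots/<fragment>.html
 * fresh; both the path and the naming scheme are illustrative.
 */
public class SnapshotFilter implements Filter {
    private static final String TOKEN = "_escaped_fragment_=";
    private static final Path SNAPSHOT_DIR = Paths.get("/var/snapshots");

    @Override
    public void doFilter(ServletRequest request, ServletResponse response,
                         FilterChain chain) throws IOException, ServletException {
        HttpServletRequest req = (HttpServletRequest) request;
        String query = req.getQueryString();

        if (query != null && query.contains(TOKEN)) {
            // Derive the snapshot file name from the escaped fragment value.
            String fragment = query.substring(query.indexOf(TOKEN) + TOKEN.length());
            Path snapshot = SNAPSHOT_DIR.resolve(sanitize(fragment) + ".html");

            if (Files.exists(snapshot)) {
                HttpServletResponse resp = (HttpServletResponse) response;
                resp.setContentType("text/html;charset=UTF-8");
                resp.getWriter().write(
                        new String(Files.readAllBytes(snapshot), StandardCharsets.UTF_8));
                return; // served from cache; no HtmlUnit call on the request path
            }
            // Fall through to live rendering (or a 404) if no snapshot exists yet.
        }
        chain.doFilter(request, response);
    }

    /** Keeps only safe characters so the fragment cannot escape the directory. */
    private static String sanitize(String fragment) {
        return fragment.replaceAll("[^A-Za-z0-9._-]", "_");
    }

    @Override
    public void init(FilterConfig config) throws ServletException { }

    @Override
    public void destroy() { }
}

Since the crawler request never triggers HtmlUnit, the ~15-second render cost moves off the request path entirely, and the refresh interval of the offline job becomes a plain freshness trade-off.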
