
How to crawl multiple URLs at the same time using Java/JSoup

So here is my dilemma: I am trying to create a web crawler that grabs PDF links off an entire website, but as my code below shows, I am only able to crawl one specific page instead of the whole site. What I would like my code to do is crawl the initial URL for PDF links (which it already does) and then search for more PDF links throughout the rest of the website. Can somebody show me exactly what I am doing wrong or what I need to add? I would really appreciate it.

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Crawler {

    /**
     * @param args the command line arguments
     * @throws java.io.IOException
     */
    public static void main(String[] args) throws IOException {

        String url = "http://www.tuskegee.edu";
        print("Fetching %s...", url);

        Document doc = Jsoup.connect(url).timeout(0).get();
        Elements media = doc.select("[src]");
        Elements imports = doc.select("link[href]");
        Elements links1 = doc.select("a[href]");


        print("\nMedia: (%d)", media.size());
        for (Element src : media) {
            if (src.tagName().equals("img"))
                print(" * %s: <%s> %sx%s (%s)",
                   src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"),
                   trim(src.attr("alt"), 20));
            else
                print(" * %s: <%s>", src.tagName(), src.attr("abs:src"));
        }

        print("\nImports: (%d)", imports.size());
        for (Element link : imports) {
            print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
        }
        print("\nLinks: (%d)", links1.size());
        for (Element link : links1) {
            print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
        }
    }

    private static void print(String msg, Object... args) {
        System.out.println(String.format(msg, args));
    }

    private static String trim(String s, int width) {
        if (s.length() > width)
            return s.substring(0, width-1) + ".";
        else
            return s;
    }
}

This should do it:

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;

import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.UnsupportedMimeTypeException;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Crawler {

    /**
     * @param args the command line arguments
     * @throws java.io.IOException
     */
    public static void main(String[] args) throws IOException {

        Set<String> visitedUrls = new HashSet<>();
        String url = "http://www.tuskegee.edu";
        crawl(url, visitedUrls);
    }

    private static void crawl(String url, Set<String> visited) throws IOException {
        // Skip empty URLs and pages we have already seen (avoids infinite loops)
        if (url.isEmpty() || visited.contains(url)) {
            return;
        }
        print("Fetching %s...", url);
        visited.add(url);
        Document doc;
        try {
            doc = Jsoup.connect(url).timeout(10000).get();
        } catch (UnsupportedMimeTypeException e) {
            System.out.println("Unsupported MIME type. Aborting crawling for URL: " + url);
            return;
        } catch (MalformedURLException e) {
            System.out.println("Unsupported protocol for URL: " + url);
            return;
        } catch (HttpStatusException e) {
            System.out.println("Error (status=" + e.getStatusCode() + ") fetching URL: " + url);
            return;
        } catch (IOException e) {
            System.out.println("Timeout fetching URL: " + url);
            return;
        }

        Elements media = doc.select("[src]");
        Elements imports = doc.select("link[href]");
        Elements links1 = doc.select("a[href]");

        print("\nMedia: (%d)", media.size());
        for (Element src : media) {
            if (src.tagName().equals("img"))
                print(" * %s: <%s> %sx%s (%s)",
                      src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"),
                      trim(src.attr("alt"), 20));
            else
                print(" * %s: <%s>", src.tagName(), src.attr("abs:src"));
        }

        print("\nImports: (%d)", imports.size());
        for (Element link : imports) {
            print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
        }
        print("\nLinks: (%d)", links1.size());
        for (Element link : links1) {
            print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
        }

        // Recurse into every link that stays on the same host as the current page
        for (Element link : links1) {
            String href = link.attr("abs:href");
            URL hrefURL = null;
            try {
                hrefURL = new URL(href);
            } catch (MalformedURLException e) {
                // ignore unparseable links
            }
            if (hrefURL != null && hrefURL.getHost().equals(new URL(url).getHost())) {
                crawl(href, visited);
            }
        }
    }

    private static void print(String msg, Object... args) {
        System.out.println(String.format(msg, args));
    }

    private static String trim(String s, int width) {
        if (s.length() > width)
            return s.substring(0, width - 1) + ".";
        else
            return s;
    }
}

Notice the last for loop added to your original code.

EDIT: Added visited-URL tracking to avoid infinite loops when an already-visited URL is encountered.

EDIT2: Added some error handling and a domain restriction, because you have to be careful or you may end up crawling the whole Internet!

You still need to extract what you are really looking for (the PDF links) and save them somewhere; a sketch of that step follows.
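
For example, here is a minimal sketch of that last step, assuming you only want links whose URL ends in .pdf. The helper names collectPdfLinks and savePdfLinks and the output file pdf-links.txt are illustrative, not part of the code above:

// Hypothetical helper: filter the "a[href]" elements already selected in crawl()
// and keep only the ones that point at a PDF.
private static void collectPdfLinks(Elements links, Set<String> pdfLinks) {
    for (Element link : links) {
        String href = link.attr("abs:href");
        // Case-insensitive check on the URL suffix
        if (href.toLowerCase().endsWith(".pdf")) {
            pdfLinks.add(href);
        }
    }
}

// Hypothetical helper: write the collected links to disk once crawling is done.
private static void savePdfLinks(Set<String> pdfLinks) throws IOException {
    java.nio.file.Files.write(java.nio.file.Paths.get("pdf-links.txt"), pdfLinks);
}

You would pass a Set&lt;String&gt; pdfLinks down through crawl() the same way visited is passed, call collectPdfLinks(links1, pdfLinks) after selecting the links, and call savePdfLinks(pdfLinks) in main() once crawl() returns.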
