简体   繁体   中英

Extract URL from string of HTML

I am trying to extract the URLs from a given String that contains an HTTP response with HREF attributes. I can find the beginning of each link, but I need to terminate the extracted string as soon as the HREF value ends. How can this be achieved?

// Downloads one hard-coded page, scans it line by line for relative "/wiki" links, and
// prints each new match and appends it to a text file.
// NOTE(review): `url`, `is` and `dis` are assigned below but never declared in this
// snippet, and no imports are shown; presumably both were omitted from the post.
public class Extracturl {
/**
 * Fetches the page, filters its lines for href="/wiki links, and echoes each new match
 * to standard output and to the output file.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException declared, although every IOException raised in the body is
 *         caught and printed below
 */
public static void main(String[] args) throws IOException {
    // Holds the line of the response currently being examined.
    String line;

    try {
        String u="http://en.wikipedia.org/wiki/china";
        String fileName = "e:\\test.txt";
         // Second constructor argument `true` opens the file in append mode.
         BufferedWriter writer = new BufferedWriter(new FileWriter(fileName,true));
        url = new URL(u);
        is = url.openStream();  // throws an IOException
        // NOTE(review): DataInputStream.readLine() is deprecated because it does not
        // convert bytes to characters properly; a BufferedReader is the usual replacement.
        dis = new DataInputStream(new BufferedInputStream(is));

        // Accumulates every link fragment seen so far; doubles as the "already seen" check.
        String w=new String();
        while ((line = dis.readLine()) != null) {


                try {
   // Keep only lines that contain a /wiki href and the text `" />`, and that do not mention "File".
   if(line.contains("href=\"/wiki")&&line.contains("\" />")&& (!line.contains("File")))
                    {   

                    // substring(int) runs from the start of href="/ to the END of the line,
                    // so everything after the link's closing quote is included as well.
                    // This is the "need to terminate the string" problem from the question.
                    if(!w.contains(line.substring(line.indexOf("href=\"/"))))
                    {w=w+line.substring(line.indexOf("href=\"/"));                        
                        System.out.println(line.substring(line.indexOf("href=\"/"))); 
                    // Writes the whole accumulator `w`, not just the newly found fragment,
                    // so earlier fragments are written to the file again on every match.
                    writer.write(w);
                    writer.newLine();
                    }}
                } catch (IOException e) {
                    e.printStackTrace();
                }
        }
    } catch (MalformedURLException mue) {
         mue.printStackTrace();
    } catch (IOException ioe) {
         ioe.printStackTrace();
    } finally {
        try {
            is.close();

           // `writer` is declared inside the try block above, so it is out of scope here;
           // presumably that is why the close call below was commented out.
           // NOTE(review): with the writer never closed or flushed, buffered output may
           // never reach the file.
           // writer.close();
        } catch (IOException ioe) {
            // A failure while closing the input stream is deliberately ignored.
        }
    }
}

    }

I also tried:

   w=w+line.substring(line.indexOf("href=\"/"),line.indexOf("\">"));

But this gave me an error.

My aim is to get all the URLs which are linked from the page.

Use an HTML parser for that purpose. Here is an example that uses the HTML parser built into the JDK (the Swing `javax.swing.text.html` package). There are alternatives such as JSoup, but for basic HTML handling this one does a pretty good job:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashSet;
import java.util.Set;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Extracts the {@code href} targets of all anchor ({@code <a>}) tags in an HTML document,
 * using the HTML parser that ships with the JDK ({@code javax.swing.text.html}).
 */
public class URLExtractor {

    /** Page fetched when no URL is supplied on the command line. */
    private static final String DEFAULT_PAGE = "http://en.wikipedia.org/wiki/china";

    /**
     * Parser callback that collects the {@code href} value of every anchor tag it is
     * shown, keeping first-seen order and dropping duplicates.
     */
    private static class HTMLPaserCallBack extends HTMLEditorKit.ParserCallback {

        /** A LinkedHashSet keeps insertion order and ignores repeated values. */
        private final Set<String> urls = new LinkedHashSet<String>();

        /**
         * @return the distinct href values collected so far, in the order they were
         *         first encountered
         */
        public Set<String> getUrls() {
            return urls;
        }

        @Override
        public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
            handleTag(t, a);
        }

        @Override
        public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
            handleTag(t, a);
        }

        /** Records the href attribute of {@code t} when it is an anchor tag that has one. */
        private void handleTag(Tag t, MutableAttributeSet a) {
            if (t != Tag.A) {
                return;
            }
            Object href = a.getAttribute(HTML.Attribute.HREF);
            if (href != null) {
                // Set.add already ignores duplicates, so no contains() check is needed.
                urls.add(href.toString());
            }
        }
    }

    /**
     * Parses the HTML supplied by {@code reader} and returns the href of every anchor tag.
     *
     * @param reader source of the HTML text; the caller remains responsible for closing it
     * @return the distinct href values in document order (empty if there are none)
     * @throws IOException if reading from {@code reader} fails
     */
    static Set<String> extractUrls(Reader reader) throws IOException {
        HTMLPaserCallBack cb = new HTMLPaserCallBack();
        // Third argument `true` tells the parser to ignore any charset declared in the
        // document; the reader has already decoded the bytes.
        new ParserDelegator().parse(reader, cb, true);
        return cb.getUrls();
    }

    /**
     * Downloads a page and prints every link found on it, one per line, prefixed with
     * {@code "Found URL: "}.
     *
     * @param args optional; {@code args[0]} is the address to fetch. When absent, the
     *             default Wikipedia page is used.
     * @throws IOException declared for compatibility; I/O failures are caught and
     *         reported on standard error
     */
    public static void main(String[] args) throws IOException {
        String address = (args != null && args.length > 0) ? args[0] : DEFAULT_PAGE;
        try {
            URL url = new URL(address);
            // try-with-resources closes the stream on every path and never touches a
            // null reference if opening the connection fails.
            // Bytes are decoded as UTF-8 rather than the platform default charset, so
            // results do not vary between machines.
            try (InputStream is = url.openStream(); // throws an IOException
                 Reader reader = new BufferedReader(
                         new InputStreamReader(is, StandardCharsets.UTF_8))) {
                for (String aUrl : extractUrls(reader)) {
                    System.out.println("Found URL: " + aUrl);
                }
            }
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}

The technical post pages of this site are licensed under CC BY-SA 4.0. If you reprint this content, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM