简体   繁体   English

Java PHP中的正则表达式类似输出

[英]Regex in Java PHP like output

So here is the plan. 所以这是计划。

  1. Get the sourcecode from a weburl's html page eg https://stackoverflow.com 从weburl的html页面获取源代码,例如https://stackoverflow.com
  2. Search through it with a pattern eg for links 用模式搜索它,例如链接
  3. Get a hashmap/list of the results in the correct order of appearance 按正确的出现顺序获取哈希图/结果列表

In PHP this is very simple to do, but it seems impossible to me with native Java. 用PHP制作起来非常简单,但是在我看来,使用本机Java是不可能的。

What it would look like in PHP: 在PHP中的样子:

<?php
// Fetch the page source; the HTML may span many lines.
$htmlCode = file_get_contents("https://stackoverflow.com");
if ($htmlCode === false) {
    // file_get_contents() returns false on failure; stop instead of matching against it.
    exit("Could not download the page.");
}

// Collapse the document into one long single-line string.
// Only real escape sequences are listed: "\cr" and "\c" are not PHP escapes
// and would be searched for as a literal backslash followed by letters.
$replacement = array("\n", "\r", "\t");
$htmlCode = str_replace($replacement, "", $htmlCode);

// Regex part. Modifiers: s = dot also matches newlines, i = case-insensitive,
// U = quantifiers are ungreedy by default.
$pattern = '#<a href="(.*)">.*</a>#siU';
// PREG_SET_ORDER: $results[$n][0] is the n-th full match, $results[$n][1] its href value.
preg_match_all($pattern, $htmlCode, $results, PREG_SET_ORDER);

// Print it out. The matches contain raw HTML, so escape the dump before echoing it.
echo "<pre>Results: ".htmlspecialchars(print_r($results, true))."</pre>";
?>

Results would look like: 结果如下:

Array = ( [0] = " http://abc.net ", [1] = " http://test.com ", ... ); Array =([0] =“ http://abc.net ”,[1] =“ http://test.com ”,...);

How can I do the same in Java? 如何在Java中做同样的事情?

(I already tried it on my own in a way I thought could work, but I think it's too overblown. I did it with JerichoHTMLParser.) (我已经以一种自己认为可行的方式尝试了它,但是我认为它太过繁琐了。使用JerichoHTMLParser做到了。)

Init.java 初始化程序

package test.java.regex;

import java.awt.*;
import java.awt.event.*;
import javax.swing.*;
import net.htmlparser.jericho.*;
import java.net.*;
import java.io.*;
import java.util.*;
import java.util.regex.*;

public class init extends JFrame {

    private JLabel lblKeyword;
    private JTextField keyword;
    private JButton exec;
    private JScrollPane sp;
    private JTextArea output;
    private Pattern pattern;
    private Matcher matcher;
    private static final String thePATTERN = "<a href=\"(http://[^\"]+)\".*</a>";

    public init(){
        this.pattern = Pattern.compile(thePATTERN);
        this.setTitle("HTML Element Link Position");
        this.setResizable(true);
        this.setLayout(new GridBagLayout());
        GridBagConstraints gbc;

        this.lblKeyword = new JLabel("Search for:");
        gbc=makeGBC(0, 0, 1, 1);
        gbc.anchor = GridBagConstraints.WEST;
        this.add(this.lblKeyword,gbc);
        this.keyword = new JTextField();
        this.keyword.setText("test");
        gbc=makeGBC(0, 1, 1, 1);
        gbc.fill=GridBagConstraints.HORIZONTAL;
        this.add(this.keyword, gbc);
        this.exec = new JButton("Execute");
        gbc=makeGBC(0, 2, 0, 1);
        gbc.anchor = GridBagConstraints.EAST;
        gbc.fill=GridBagConstraints.BOTH;
        gbc.weightx=1.0;
        gbc.weighty=0.1;
        this.add(this.exec, gbc);
        this.output = new JTextArea();
        gbc=makeGBC(0, 3, 1, 2);
        gbc.fill=GridBagConstraints.BOTH;
        gbc.weightx=1.0;
        gbc.weighty=1.0;
        this.add(this.output, gbc);
        this.sp = new JScrollPane(this.output);
        this.sp.setPreferredSize(new Dimension(500,100));
        gbc = makeGBC(0, 3, 1, 1);
        gbc.fill=GridBagConstraints.BOTH;
        gbc.weightx=1.0;
        gbc.weighty=1.0;
        this.add(this.sp, gbc);

        this.exec.addActionListener(
            new ActionListener() {
                public void actionPerformed(ActionEvent e) {
                    String pos = getPosition();
                    System.out.println("Position: "+pos);
                    //output.setText( pos );
                }
        });

        this.addWindowListener(new WindowAdapter() {
            public void windowClosing(WindowEvent e) {
                System.exit(0);
            }
        });
        this.setPreferredSize(new Dimension(700,600));
        this.pack();
        this.setLocation
        (
                (Toolkit.getDefaultToolkit().getScreenSize().width-this.getWidth())/2,
                (Toolkit.getDefaultToolkit().getScreenSize().height-this.getHeight())/2
        );
        this.setVisible(true);
    }


    private String getPosition()
    {
        String urlString = "";
        String result="";
        if( !this.keyword.getText().isEmpty() )
        {
            URL url;
            URLConnection uc;
            StringBuilder parsedContentFromUrl = new StringBuilder();

            urlString = "http://stackoverflow.com";

            MicrosoftConditionalCommentTagTypes.register();
            PHPTagTypes.register();
            PHPTagTypes.PHP_SHORT.deregister();
            MasonTagTypes.register();

            try{
                url = new URL( urlString );
                uc = url.openConnection();
                uc.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0");
                uc.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
                uc.connect();
                uc.getInputStream();
                BufferedInputStream in = new BufferedInputStream(uc.getInputStream());
                int ch;
                while ((ch = in.read()) != -1) {
                    parsedContentFromUrl.append((char) ch);
                }
                Source source = new Source( parsedContentFromUrl );
                result = source.toString();
                this.output.setText( result );
                matcher = this.pattern.matcher(result);
                boolean matches = matcher.matches();
                if( matches ){
                this.output.setText( "found" );
                }
//              BufferedReader inl=null;
//              inl = new BufferedReader( new InputStreamReader(uc.getInputStream()) );
//                StringBuffer sb = new StringBuffer("");
//                String line = "";
//                //String NL = System.getProperty("line.separator");
//                while ((line = inl.readLine()) != null) {
//                    sb.append(line);
//                }
//                inl.close();
//                result = sb.toString();

//              BufferedInputStream in1 = new BufferedInputStream(uc.getInputStream());
//                ByteArrayOutputStream baos = new ByteArrayOutputStream();
//                int ch1;
//                while ((ch1 = in1.read()) != -1) {
//                    baos.write((byte)ch1);
//                }
//                baos.close();
//                String st = new String(baos.toByteArray(), "UTF-8");

                //result = source.getRenderer().toString();

            }
            catch (Exception e) {
                result = e.toString();
            }
            //

            return result;
        }else{
            return "No keyword given";
        }
    }

    private GridBagConstraints makeGBC(int gx, int gy, int gw, int gh)
    {
        GridBagConstraints gbc=new GridBagConstraints();
        gbc.gridx=gx;
        gbc.gridy=gy;
        gbc.gridwidth=gw;
        gbc.gridheight=gh;
        gbc.fill=GridBagConstraints.NONE;
        gbc.weightx=0;
        gbc.weighty=0;
        gbc.anchor=GridBagConstraints.CENTER;
        gbc.insets=new Insets(2,2,2,2);
        return gbc;
    }

    public static void main(String[] args) {
        try {
            for (LookAndFeelInfo laf : UIManager.getInstalledLookAndFeels()) {
                if ("Nimbus".equals(laf.getName())) {
                    UIManager.setLookAndFeel(laf.getClassName());
                    break;
                }
            }
        } catch (Exception e) {
            // If Nimbus is not available, you can set the GUI to another look and feel.
        }
        new init();
    }
}

Thanks in advance for any help. 在此先感谢您的帮助。 :) :)

Yes it may be done shorter, but let's start with that code, as it is almost there. 是的,它可以做的更短一些,但是让我们从该代码开始,因为它已经差不多了。

uc.getInputStream();

This can of course be removed, as the stream is fetched and used on the next line. 当然可以删除,因为它已提取并用于下一行。

 // Matcher.matches() succeeds only when the ENTIRE input matches the pattern,
 // so a pattern describing a single link will not match a whole HTML page.
 boolean matches = matcher.matches();
 if (matches) {
     this.output.setText( "found" );
}

has an error: in Java, matches() requires the pattern to match the entire string. Use find() instead: 有一个错误:在Java中,matches()需要对整个字符串进行匹配。请改用find():

while (matcher.find()) {
    output.setText(matcher.group(1));
    // Or something different, as this keeps only the last one found.
}

And I am not acquainted with Source. 我不了解Source。

One could read all in a ByteArrayOutputStream as you did too. 您也可以像这样在ByteArrayOutputStream中读取所有内容。

String st = new String(baos.toByteArray(), StandardCharsets.ISO_8859_1);

You could use ISO-8859-1, as this would not cause the decoding errors that UTF-8 might encounter when the page is not in UTF-8. 您可以使用ISO-8859-1,因为它不会出现当页面不是UTF-8编码时UTF-8可能遇到的解码错误。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM