How can I make this faster at reading and writing?

public class DataMiner  {

private static BigData app = new BigData();
private static DomainOfConstants doc = new DomainOfConstants();
private static Logger log = Logger.getLogger(DataMiner.class);
private static DBManager conn = new DBManager();
private static java.sql.Connection con = null;
private static AmazonS3  s3Client;
private static Iterator<String> itr;
private static List<String> entries = new ArrayList<String>();
private static S3Object s3Object;
private static ObjectMetadata meta;
public static InputStream dataStream;
public static byte[] buffer = new byte[1024];
public static File file = new File(app.getCurrentPacsId()+".txt");



private static void obtainConnection(){
    conn.connection();
    entries = conn.grabDataSet();       
    conn.closeDb();
    downloadBucket();
}

/*
 * 
 * The Java heap size limits for Windows are:
 * maximum possible heap size on 32-bit Java: 1.8 GB
 * recommended heap size limit on 32-bit Java: 1.5 GB (or 1.8 GB with /3GB option)
 * 
 * */
/*-------------Download and un-zip backup file-------------*/
private static void downloadBucket(){

    try {
        app.setAwsCredentials(doc.getAccessKey(), doc.getSecretKey());
        s3Client = AmazonS3ClientBuilder.standard().withCredentials(new AWSStaticCredentialsProvider(app.getAwsCredentials())).withRegion(Regions.US_EAST_1).build();
        System.out.println("Connected to S3");
        itr = entries.iterator();
        while(itr.hasNext()){
            app.setBucketKey(itr.next());
            String key = app.getBucketKey();
            app.setCurrentPacsId(key);
            s3Object = s3Client.getObject(new GetObjectRequest(doc.getDesiredBucket(), app.getBucketKey()));
            try {
                ZipInputStream zis = new ZipInputStream(s3Object.getObjectContent());
                ZipEntry entry = zis.getNextEntry();
                extractObjects(buffer, s3Client, zis, entry);                   
            } catch (AmazonServiceException e) {
                log.error(e);
            } catch (SdkClientException e) {
                log.error(e);
            } catch (IOException e) {
                log.error(e);
            }
        }
        System.out.println("Processing complete");


    } catch (IllegalArgumentException e) {
        e.printStackTrace();
    } 
}

public static void extractObjects(byte[] buffer, AmazonS3 s3Client, ZipInputStream zis, ZipEntry entry) throws IOException {
    PipedOutputStream outputStream = null;
    PipedInputStream is = null;
    try {
        while (entry != null) 
        {
            String fileName = entry.getName();
            if (fileName == "lib") {
                fileName = entry.getName();
            }
            boolean containsBackup = fileName.contains(doc.getDesiredFile());

            if (containsBackup) {
                System.out.println("A back up file was found");
                long start = System.currentTimeMillis();
                formatSchemaName();
                System.out.println("Extracting :" + app.getCurrentPacsId());
                log.info("Extracting " + app.getCurrentPacsId() + ", 
                compressed: " + entry.getCompressedSize() + " bytes, 
                extracted: " + 
                entry.getSize() + " bytes");
         //ByteArrayOutputStream outputStream = new ByteArrayOutputStream();


                outputStream = new PipedOutputStream();
                is = new PipedInputStream(outputStream);

                int len;
                while ((len = zis.read(buffer)) >= 0) 
                {
                    outputStream.write(buffer, 0, len);
                }
   //InputStream is = new ByteArrayInputStream(outputStream.toByteArray());
                meta = new ObjectMetadata();
                meta.setContentLength(file.length());
                fileName = app.getCurrentPacsId();
                runDataConversion(is,s3Client,fileName);
                recordTime(start);
                is.close();
                outputStream.close();
                System.out.println("Unzip complete");               
            }
            else{
                System.out.println("No back up found");
            }
            entry = zis.getNextEntry();
        }
        zis.closeEntry();
        zis.close();
    } catch (AmazonServiceException e) {
        log.error(e);
    } catch (SdkClientException e) {
        log.error(e);
    }
}


/*------------Formatting the replacement file name---------*/
private static void formatSchemaName(){
    String s3Key = app.getCurrentPacsId();
    String id = s3Key.replace(".zip", ".txt");
    id = id.substring(id.indexOf("_"));
    id = id.replaceFirst("_", "");
    app.setCurrentPacsId(id);
}

/*---------------Process the data file----------------------*/
private static void runDataConversion(PipedInputStream is, AmazonS3 s3Client, String fileName) {
    DataProcessor convert = new DataProcessor(s3Client);
    convert.downloadBucket(is,fileName);
}

/*-------Records execution time of program in min/sec------*/
private static void recordTime(long start) throws IOException {
    long end = System.currentTimeMillis();
    long minutes = TimeUnit.MILLISECONDS.toMinutes(end - start);
    long seconds = TimeUnit.MILLISECONDS.toSeconds(end - start);
    System.out.println("Execution speed "+ minutes + ":" + (seconds % 60) +" min/sec\n");
}

And here is the class that does the text file processing. The code is very slow overall when processing files of up to 3.5 GB; a run takes about 3 hours. I have tried using piped streams instead of byte streams. The Java heap size is set to -Xms2800m on a 64-bit JDK.

public class DataProcessor {

private static AmazonS3 s3Client;
private static ObjectMetadata meta;
private static DomainOfConstants doc = new DomainOfConstants();
private static BigData app = new BigData();
public static File file = new File(app.getCurrentPacsId()+".txt");
private static Logger log = Logger.getLogger(DataProcessor.class);

//Store the shared S3 client (the field is static, so assign it via the class)
public DataProcessor (AmazonS3 s3Client){
    DataProcessor.s3Client = s3Client;
}

//Convert the stream and upload the result to S3
public void downloadBucket(PipedInputStream is, String fileName) {
    try {
        File dataStream = dataConversion(is);
        s3Client.putObject(doc.getDestinationBucket(),FilenameUtils.getFullPath(doc.getDestinationKey()) + "Modified_"+ fileName, dataStream);
    } catch (AmazonServiceException e) {
        e.printStackTrace();
        log.error(e);
    } catch (SdkClientException e) {
        e.printStackTrace();
        log.error(e);

    }               
}

//Set up reading and writing streams
public static File dataConversion(PipedInputStream stream) {
    BufferedReader reader = null;
    BufferedOutputStream streamOut = null;
    String line;

    try {
        reader = new BufferedReader(new InputStreamReader(stream,doc.getFileFormat()));
        streamOut = new BufferedOutputStream(new FileOutputStream(file));
        meta = new ObjectMetadata();
        while(( line = reader.readLine() ) != null)
        {
            processLine(reader, streamOut, line);
        }
    }
    catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (streamOut != null) streamOut.close();
            if (reader != null) reader.close();
        } catch (IOException e) {
            e.printStackTrace();
            log.error(e);
        }
    }
    return file;
}


/*---------------------------------------Data processing------------------------------------------------*/

    /*-----------Process and print lines---------*/
private static void processLine(BufferedReader reader, BufferedOutputStream streamOut, String line) {
    try {
        String newLine = System.getProperty("line.separator");

        while (reader.ready()) {
            if (line.contains(doc.getInsert())) {
                handleData(streamOut, line);
            } else if (line.contains(doc.getUse())) {
                handleSchemaName(streamOut, line);
            } else {
                streamOut.write(line.toLowerCase().getBytes(Charset.forName(doc.getFileFormat())));
                streamOut.write(newLine.getBytes());
            }
            line = reader.readLine();
        }
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
        log.error(e);

    } catch (IOException e) {
        e.printStackTrace();
        log.error(e);

    }
}

    /*-----------Replace-Schema-Name-----------*/
private static void handleSchemaName(BufferedOutputStream streamOut, String line) throws IOException {
    line = "USE " + "`" + doc.getSchemaName() + app.getCurrentPacsId() + "`;";
    streamOut.write(line.getBytes(Charset.forName(doc.getFileFormat())));
}


    /*--------Avoid-Formatting-Data-Portion-of-file--------*/
private static void handleData(BufferedOutputStream streamOut, String line) throws IOException {
    StringTokenizer tk = new StringTokenizer(line);
    while (tk.hasMoreTokens()) {
        String data = tk.nextToken();
        if (data.equals(doc.getValue())) {
            streamOut.write(data.toLowerCase().getBytes(Charset.forName(doc.getFileFormat())));
            data = tk.nextToken();
            while (tk.hasMoreTokens()) {
                streamOut.write(data.getBytes(Charset.forName(doc.getFileFormat())));
                data = tk.nextToken();
            }
        }
        streamOut.write(line.toLowerCase().getBytes(Charset.forName(doc.getFileFormat())));
        streamOut.write(" ".getBytes(Charset.forName(doc.getFileFormat())));
    }
}
  1. Rule 1 is always to use a bigger buffer. 1024 is pitifully small. Try 32-64K.
  2. You need to start the pipe reading thread before doing any writes to the pipe. In fact I'm surprised you don't get 'read end dead' errors. Does this code really work at all?
  3. In fact get rid of the piped streams. Use a single thread and do all the processing as you go.
  4. Get rid of the ready() test. It is an extra system call for nothing. Just read until end of stream.
  5. Use a BufferedWriter instead of a BufferedOutputStream and stop converting all those strings to bytes (and use BufferedWriter.newLine() instead of the system property). A minimal sketch combining these points follows this list.
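
Putting points 1, 3, 4 and 5 together, here is a minimal single-threaded sketch. It is an illustration, not your exact code: extractEntry and transform are hypothetical names, and StandardCharsets.UTF_8 stands in for whatever doc.getFileFormat() returns. Each zip entry is streamed straight from the ZipInputStream to a local file through one BufferedReader/BufferedWriter pair, with a 64K buffer, no pipes, no second thread, and no ready() test:

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.zip.ZipInputStream;

public class SingleThreadedExtractor {

    private static final int BUF = 64 * 1024; // point 1: 64K, not 1024

    /* Streams the current zip entry to a local file in a single pass,
       reading until readLine() returns null (end of stream). */
    public static File extractEntry(ZipInputStream zis, String outName) throws IOException {
        File out = new File(outName);
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(zis, StandardCharsets.UTF_8), BUF);
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(out), StandardCharsets.UTF_8), BUF)) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(transform(line)); // your INSERT/USE handling goes here
                writer.newLine();              // point 5: newLine(), not the system property
            }
        }
        // The reader is deliberately left open: closing it would close the
        // underlying ZipInputStream, which the caller still needs for getNextEntry().
        return out;
    }

    // Hypothetical stand-in for the per-line logic in DataProcessor
    private static String transform(String line) {
        return line.toLowerCase();
    }
}

The caller keeps its existing getNextEntry() loop and uploads the returned file with s3Client.putObject(bucket, key, file) as before; nothing is buffered in memory or shuttled through a pipe, so the heap size stops being the bottleneck.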
