繁体   English   中英

逐个字符读取一个巨大的文本文件需要很长时间

[英]Reading a huge text file character by character taking a long time

我正在逐个字符地使用FileInputStream读取Java中的9 KB文本文件,并且几乎要花一分钟时间才能读取。 这个性能好吗,还是可以通过使用其他一些Stream例如BufferdReader并一次读取整个数据来优化此性能。

// This method is used to read the Brown Corpus
public void readBrownCorpus(String corpusPath) throws IOException {
    FileInputStream inputStream = null;
    try {
        inputStream = new FileInputStream(corpusPath);
        int letter = 0; // denote current read letter
        String previousTag = "^";
        StringBuilder wordWithTag = new StringBuilder(); // denote the string which
        while((letter = inputStream.read()) != -1) {
            if(((char) letter) != ' ')
                wordWithTag.append((char) letter);
            else {
                String word[] = wordWithTag.substring(0).split("_");
                if(word != null && word.length != 2)
                    throw new Exception("Error in the Format of Corpus");
                // If new tag found, insert this in both transitionTable and emissionTable
                if(transitionTable.get(word[1]) == null) {
                    insertTagInTransitionTable(previousTag, word[1]);
                }

                updateTranstionTable(previousTag, word[1]);
                updateEmissionTable(word[0], word[1]);

                // update the previous Tag
                if(word[1].equals("."))
                    previousTag = "^";
                else
                    previousTag = word[1];
                wordWithTag.setLength(0); //empty the wordWithTag for new word
                System.out.println(transitionTable.size());
            }
        }
    } catch(IOException ioException) {
        ioException.printStackTrace();
    } catch(Exception exception) {
        exception.printStackTrace();
    }
    finally {
        inputStream.close();
    }
}

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package demo;
import java.util.*;
import java.io.*;

/**
 *
 * @author Jatin Khurana
 */
public class Main {

    public HashMap<String,Row> transitionTable;  // Transition Table
    public HashMap<String,Row> emissionTable; // Emission Table

    // Constructor
    public Main()
    {
        transitionTable=new HashMap<String,Row>();
        emissionTable=new HashMap<String,Row>();
        prepareInitialTransitionTable();
        //prepareInitialEmissionTable();
    }

    // This method prepare the initial transition Table
    private void prepareInitialTransitionTable()
    {
        Row row1=new Row();
        row1.tagCount.put("^", 0.0f);
        row1.tagCount.put(".", 0f);
                Row row2=new Row();
        row2.tagCount.put("^", 0f);
        row2.tagCount.put(".", 0f);
        transitionTable.put("^", row1);
        transitionTable.put(".", row2);
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws IOException{

        Main m=new Main();

        BufferedReader inputStream=null;
        try
        {
            inputStream=new BufferedReader(new FileReader("d://postagger//corpus//brown.txt"));
            String corpusData = inputStream.readLine();
            String previousTag="^";
                        String wordWithTag[] = corpusData.split(" ");
            for(int i=0;i<wordWithTag.length;i++)
                        {
                                        String word[]=wordWithTag[i].split("_");
                    if(word!=null && word.length!=2)
                        throw new Exception("Error in the Format of Corpus");
                    // If new tag found,insert this in both transitionTable and emissionTable
                    if(m.transitionTable.get(word[1])==null)
                    {
                        m.insertTagInTransitionTable(previousTag,word[1]);
                    }

                    m.updateTranstionTable(previousTag,word[1]);
                    m.updateEmissionTable(word[0],word[1]);

                    // update the previous Tag
                                        if(word[1].equals("."))
                                        {
                                            previousTag="^";
                                        }
                                        else
                                        {
                                            previousTag=word[1];
                                        }
                    System.out.println(m.transitionTable.size());
                }
            }
        catch(IOException ioException)
        {
            ioException.printStackTrace();
        }
        catch(Exception exception)
        {
            exception.printStackTrace();
        }
        finally
        {
            inputStream.close();
        }
    }

    private void insertTagInTransitionTable(String previousTag,String newTag) throws CloneNotSupportedException
    {
            Row row = (Row)transitionTable.get(previousTag);
            row.tagCount.put(newTag,0f);
                        Row newRow=new Row();
            transitionTable.put(newTag, newRow);
    }

    // This method is used to update the transitionTable
    private void updateTranstionTable(String previousTag,String currentTag)
    {
        Row row = transitionTable.get(previousTag);
                if(row.tagCount.get(currentTag)==null)
                {
                    row.tagCount.put(currentTag, 1f);
                }
                else
                {
                    row.tagCount.put(currentTag, row.tagCount.get(currentTag)+1);
                }

    }

    // This method is used to update the emission table
    private void updateEmissionTable(String word,String tag)
    {
                Row row = emissionTable.get(word);
                if(row==null)
                {
                    Row newRow=new Row();
                    newRow.tagCount.put(tag, 1f);
                    emissionTable.put(word, newRow);
                }
                else
                {
                    if(row.tagCount.get(tag)==null)
                    {
                        row.tagCount.put(tag, 1f);
                    }
                    else
                    {
                        row.tagCount.put(tag,row.tagCount.get(tag)+1);
                    }
                }
    }

}

我的老师说我必须在3到5秒钟内做同样的事情。 怎么做?

使用BufferedInputStream包装FileInputStream以获得简单的快速修复。 然后考虑使用readLine()

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM