簡體   English   中英

Java代碼未處理巨大的CSV文件

[英]Java code is not processing a huge CSV file

我是Java的新手,我必須編寫代碼來處理巨大的CSV文件,更具體地說,是旋轉文件並以其他方式輸出。

當我測試幾百行代碼時,它可以完美運行! 但是,當我制作大約70萬行的文件時,就是這樣,它輸出的文件沒有任何值!

這是我正在使用的文件模式:

row_number,Time_float,V1_float,V2_float,V3_float,V4_float,V5_float,V6_float,V7_float,V8_float,V9_float,V10_float,V11_float,V12_float,V13_float,V14_float,V15_float,V16_float,V17_float,V18_float,V19_float,V20_float,V21_float,V22_float,V23_float,V24_float,V25_float,V26_float,V27_float,V28_float,Amount_float,Class_float
0,-1.996583023457193,-0.6942423209592997,-0.04407492457044802,1.6727734992241514,0.973365514375461,-0.24511658354252203,0.34706794516190337,0.19367893831762997,0.0826372794108694,0.33112778320993075,0.08338554524255039,-0.540407035731003,-0.6182957177945313,-0.9960989219768981,-0.32461018632700356,1.6040138389062168,-0.5368328685192513,0.24486345402090126,0.030769932602201018,0.4962820266510568,0.3261180160164485,-0.024923364961491876,0.38285443833968436,-0.17691133433749112,0.1105069205607409,0.2465854429694212,-0.3921704315485548,0.3308916226487169,-0.06378115069750527,0.24496426337017338,-0.04159897836869265
1,-1.996583023457193,0.608496327692869,0.16117591988327584,0.10979710210602639,0.31652292674812704,0.043483352047263765,-0.06181996595114916,-0.06370020974401087,0.07125348305960227,-0.23249418894056792,-0.15334962861450452,1.5800028495181635,1.0660885709726662,0.49141820388511015,-0.14998248182742244,0.6943604186631746,0.5294337538197206,-0.13516996909847068,-0.21876258231230966,-0.17908604930753447,-0.08961086263099204,-0.3073768045456866,-0.880076754448717,0.16220118362512442,-0.5611305498033158,0.3206939009070385,0.2610694754212664,-0.02225567818703569,0.04460751768012477,-0.3424745411051305,-0.04159897836869265
2,-1.9965619655334634,-0.6935004627168448,-0.8115778263099092,1.1694684928233277,0.2682312938426663,-0.3645717858377013,1.3514535859514658,0.6397756379029846,0.20737272949225669,-1.3786753514283783,0.19069961380161685,0.6118297100478469,0.06613661868011089,0.720699852326139,-0.173113888845362,2.5629061849805295,-3.298235372241396,1.3068678794417397,-0.14478999149835625,-2.7785608505208725,0.6809749715218957,0.33763169617058825,1.0633582711230263,1.456319745720107,-1.1380921384645095,-0.6285367205194723,-0.2884467520153837,-0.1371368556920797,-0.18102082710565728,1.1606859252297228,-0.04159897836869265
3,-1.9965619655334634,-0.4933248981469483,-0.11216942463929835,1.1825164508596404,-0.6097266412236745,-0.007468880343610824,0.9361498321757143,0.19207063819763437,0.3160175995113439,-1.2625031722059106,-0.050467953146436886,-0.22189161429591525,0.17837098770941834,0.5101687012441243,-0.30036049360793543,-0.6898374093803747,-1.2092959931757417,-0.8054446421603216,2.3453045221536306,-1.5142049233314772,-0.26985522543866364,-0.1474432966746349,0.007266907401368897,-0.30477654737727394,-1.941027139612268,1.2419037126390184,-0.46021734156146177,0.15539620725948264,0.1861885865396321,0.14053425198451397,-0.04159897836869265
4,-1.9965409076097336,-0.5913297637052182,0.5315410497431424,1.0214116755556233,0.28465540421364943,-0.29501543610441505,0.07199858317395955,0.4793022833609244,-0.22651023121331143,0.7443262870821311,0.6916250322008002,-0.8061465859382944,0.5386266478753359,1.3522443515849518,-1.1680335140679103,0.1913234721415302,-0.5152051215648076,-0.2790807862169683,-0.04556900286007717,0.987037297022314,0.5299387935696488,-0.012839217648641888,1.1000112712256314,-0.2201233960065819,0.2332500879381676,-0.39520164174981287,1.041611299621563,0.5436197963831321,0.6518159160992683,-0.07340334025310606,-0.04159897836869265
5,-1.9965409076097336,-0.21747462336443635,0.5816748868716723,0.7525853776136525,-0.11883332826086528,0.30500895924490323,-0.02231347965260122,0.3849359380956985,0.21795466774987732,-0.5176185728033812,-0.34110109646946285,1.3140464540936228,0.360182114067614,-0.3597915734090633,-0.1430571341620137,0.5655071130706558,0.45845966002880245,-0.06844505898436722,0.08190792006276612,-0.04077665181354113,0.1102154145899529,-0.2835222214491732,-0.7714270019345486,-0.04227284666287563,-0.6132733979473761,-0.4465835771366699,0.21963714625507125,0.6289004813773338,0.24563620173852233,-0.33855641663507124,-0.04159897836869265
6,-1.996498791762274,0.6277951843506749,0.08538910070753938,0.029922970138274736,0.8493831061038468,0.139019583041878,0.20469452783029024,-0.004170268076019058,0.0679975594125977,0.4232179413095267,-0.0911553627796218,-1.3881568003685008,-0.15394904162357412,-0.7546302429818224,0.174601525806113,0.05478291827131152,-0.5062323708749048,0.0033208458038780472,-0.7301429071537743,-0.05598631103904616,-0.28489532870127926,-0.22833366503144684,-0.373032409163239,-0.24677959261431537,-1.2879725270568725,1.4390366584060272,-0.5334361005147672,0.08549234948827854,0.015655983019391424,-0.33327894285907317,-0.04159897836869265
7,-1.9964356179910847,-0.3289283471917919,0.8586923440261,0.708576234767151,-0.34763105227477603,0.6875116870747386,0.32134542115371234,0.9058597727667212,-3.1882293041144427,0.5601291069522123,1.1474295272213637,-0.6068981847284727,0.2917078257188488,1.7663144960411647,-1.381049034972912,0.7496140133583983,-0.08687803964000758,-1.4389216521167638,-0.42738292340326745,0.39863532675456365,-0.20331695379069067,2.6458886485220905,-1.3992756084115,0.0920853274818958,-1.0727537219105157,-0.7966329656594401,-0.1070748551085608,-2.9901537037868864,-3.2880827705218882,-0.19010747625415497,-0.04159897836869265
8,-1.9964356179910847,-0.45657301681652984,0.17329147453401414,-0.07465262179597631,-0.19177387759003722,1.9341493665366207,2.793594054309211,0.29920595987227944,0.7125916765850463,-0.35685128708200337,-0.3769401127987114,-0.690809076979848,-0.11054073462883784,-0.2876133294193531,0.07756710959787069,-0.3592023119205175,-0.23974544333000244,-0.5884222737443606,0.14169463193440693,0.7006152251433341,0.06840582416474023,-0.09996300675392492,-0.36942472981431596,-0.327055266660248,1.6702690915973604,0.7159429034669824,-0.7966330619330197,0.02910414108912445,0.43142017601489174,0.019392240308193916,-0.04159897836869265
9,-1.9963935021436252,-0.17269774387883696,0.6780049203798014,0.6887814922128964,-0.15692675912027115,0.36179160642264196,-0.1852187109108031,0.5267057814810946,0.05822291589340413,-0.6705871938144513,-0.33691175301635523,0.996966068530707,0.837059523154456,1.0116259929700517,-0.46268060306897046,0.1641174886473323,0.8438820412040243,-0.6369448535966505,0.568708693313514,0.5549770030986336,0.2642433515365529,-0.336155613099156,-0.8732980464093,-0.19343791270246957,-0.6357673003859946,-0.1337734549071511,0.195341606203293,0.6100096992904638,0.25168133078356686,-0.33851643577313184,-0.04159897836869265
10,-1.9963724442198953,0.7398016180995844,-0.7123689085952064,0.6027095930866821,-0.9716078839795883,-1.4282856664438242,-0.4722411589208851,-1.150469214075524,0.04057090066824084,-1.5659577306254295,1.493926862783695,1.1753019928647082,-0.6719776055792052,-0.5163883874999073,-0.09915047715944188,0.2522962688421534,0.03648207425930615,0.29836818807428966,1.0192908179449345,-0.27193462986307226,-0.5022889927521785,-0.012663864860947654,0.43254001227028344,0.0444226909424958,0.826410614996712,0.4822144153958534,-0.26850047275423083,0.10616079403669354,0.04923997135326903,-0.32204432065410754,-0.04159897836869265

這是代碼:

import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;
import java.io.FileWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

public class test {

    public static void main(String[] args) {
        try {
            //csv file containing data
            CSVReader reader = new CSVReader(new FileReader("/Users/csv-parser/sentiment.csv"));
            CSVWriter writer = new CSVWriter(new FileWriter("/Users/csv-parser/Output.csv"));
            String [] nextLine;
            String [] Target_Columns = new String[] {"row_number", "column_name", "column_value_string", "column_value_float"};
            List<String> Columns = null;
            long start = System.currentTimeMillis();

            int lineNumber = 0;

            while (true) {
                if ((nextLine = reader.readNext()) == null) break;
                lineNumber++;
                if (lineNumber == 1) {
                    writer.writeNext( Target_Columns );
                    Columns = Arrays.asList( nextLine );
//                    System.out.println( Arrays.toString( nextLine ) );
                    continue;
                }
                System.out.println( Arrays.toString( nextLine ) );
                int indexOfRow = Columns.indexOf( "row_number" );
                String row_value = "{ROW_NUMBER},{COl_NAME},{COL_STR},{COL_FL}";
                int i = 1;
                for(String column: Columns){
                    if (column.equals("row_number")) continue;
                    row_value = row_value.replace( "{ROW_NUMBER}", nextLine[0]);

                    if (column.contains( "_string" )) {
//                        System.out.print( "col_str: " + finalNextLines[i] + " " );
                        row_value = row_value.replace( "{COL_STR}", nextLine[i] );
                        row_value = row_value.replace( "{COL_FL}", " " );
                        row_value = row_value.replace( "{COl_NAME}", column.replace( "_string", "" ) );
                    }
                    if (column.contains( "_float" )) {
//                        System.out.print( "col_fl: " + finalNextLines[i] + " " );
                        row_value = row_value.replace( "{COL_STR}", " " );
                        row_value = row_value.replace( "{COL_FL}",  nextLine[i] );
                        row_value = row_value.replace( "{COl_NAME}", column.replace( "_float", "" ) );
                    }


                    i++;
//                    System.out.println( "ROW: " + row_value + " " );
                    writer.writeNext( row_value.split( "," ) );
                    row_value = "{ROW_NUMBER},{COl_NAME},{COL_STR},{COL_FL}";
                }

//                System.out.println("\n");
            }
            writer.close();
            long finish = System.currentTimeMillis();
            System.out.println( "Time elapsed: " + (finish - start));

        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

還有,是否有一種方法可以使代碼同時讀取或處理文件或使用多線程處理文件,我的意思是使其效率更高?

更新:

這是我處理原始文件時的輸出結果 產量

在這些非常大的文件中,偶爾會收到錯誤的輸入。 檢查無用輸入的最佳時間是在讀取時,在記錄/行級別(對於CSV文件,自然選擇行)。

因此,在迭代器循環中放置一個“ if”語句,使它對您認為是錯誤輸入的任何錯誤的錯誤代碼使程序崩潰。 誰知道,您可能會在大量完全填充的行之間找到400個空白行。 另外,您可能會找到特殊的控制字符等。

一旦您感到輸入正確,就可以開始調試樞軸的邏輯。 誰知道,也許您在庫中發現了一個錯誤。

就我個人而言,我認為這是輸入數據的問題,因為您表示使用較小的數據集時邏輯運行良好。 奇怪的是,由於數據量大,您可能沒有注意到較大的數據集。

如果沒有完整的堆棧跟蹤,很難說出問題所在,因為您的代碼在性能,JVM內存使用等方面存在許多潛在的問題。我認為為您提供一種正確的解決方案比較容易; 至少可以處理700k文本文件。

import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;

import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

public class ReadJSONExample {

    public static void main(String[] args) throws IOException {
        // Move time calculation out of main logic
        long start = System.currentTimeMillis();

        new CsvDataModifier(Paths.get("e:/data.csv"), Paths.get("e:/res.csv")).apply();

        long finish = System.currentTimeMillis();
        System.out.printf("Time elapsed: %d ms", finish - start);
    }
}

final class CsvDataModifier {

    private final Path src;
    private final Path dest;

    public CsvDataModifier(Path src, Path dest) {
        this.src = src;
        this.dest = dest;
    }

    private static final int ROW_NUMBER = 0;
    private static final int COLUMN_NAME = 1;
    private static final int COLUMN_VALUE_STRING = 2;
    private static final int COLUMN_VALUE_FLOAT = 3;
    private static final String[] COLUMN_NAMES = { "row_number", "column_name", "column_value_string", "column_value_float" };

    public void apply() throws IOException {
        // Use `try with recources` to close streams correctly
        try (CSVReader reader = new CSVReader(new FileReader(src.toFile())); 
             CSVWriter writer = new CSVWriter(new FileWriter(dest.toFile()), ',', CSVWriter.NO_QUOTE_CHARACTER)) {
            // key - ordered list of columns in source file
            Map<String, Marker> columnNameFloatMarker = getSourceColumnNamesWithFloatMarker(reader.readNext());
            int posRowNumber = getRowNumberPosition(columnNameFloatMarker.keySet());

            if (columnNameFloatMarker.isEmpty())
                return;

            writer.writeNext(COLUMN_NAMES);

            // Create buffer only once for all lines; do not use string concatenation or replacing
            String[] buf = new String[COLUMN_NAMES.length];

            reader.forEach(values -> {
                buf[ROW_NUMBER] = values[posRowNumber];

                int col = 0;
                // this is just reference to buf[]; if `null` then no output
                String[] resultLine;

                for (Map.Entry<String, Marker> entry : columnNameFloatMarker.entrySet()) {
                    String columnName = entry.getKey();
                    Marker marker = entry.getValue();

                    if ((resultLine = marker.createResultLine(columnName, values[col], buf)) != null)
                        writer.writeNext(resultLine);

                    col++;
                }
            });
        }
    }

    private static final String FLOAT = "_float";
    private static final String STRING = "_string";

    private enum Marker {
        NONE {
            @Override
            public String[] createResultLine(String columnName, String value, String[] buf) {
                return null;
            }
        },
        STRING {
            @Override
            public String[] createResultLine(String columnName, String value, String[] buf) {
                buf[COLUMN_VALUE_STRING] = value;
                buf[COLUMN_VALUE_FLOAT] = " ";
                buf[COLUMN_NAME] = columnName;
                return buf;
            }
        },
        FLOAT {
            @Override
            public String[] createResultLine(String columnName, String value, String[] buf) {
                buf[COLUMN_VALUE_STRING] = " ";
                buf[COLUMN_VALUE_FLOAT] = value;
                buf[COLUMN_NAME] = columnName;
                return buf;
            }
        };

        public abstract String[] createResultLine(String columnName, String value, String[] buf);
    }

    // Source column preprocessing to avoid string comparision; do it only once
    private static Map<String, Marker> getSourceColumnNamesWithFloatMarker(String... columns) {
        if (columns == null || columns.length == 0)
            return Collections.emptyMap();

        Map<String, Marker> map = new LinkedHashMap<>();

        for (int i = 0; i < columns.length; i++) {
            String columnName = columns[i];
            Marker marker = Marker.NONE;

            if (columnName.endsWith(FLOAT)) {
                columnName = columnName.substring(0, columnName.length() - FLOAT.length());
                marker = Marker.FLOAT;
            } else if (columnName.endsWith(STRING)) {
                columnName = columnName.substring(0, columnName.length() - STRING.length());
                marker = Marker.STRING;
            }

            if (map.put(columnName, marker) != null)
                throw new IllegalArgumentException("Column duplication in the source file");
        }

        return map;
    }

    private static int getRowNumberPosition(Set<String> columnNames) {
        int i = 0;

        for (String columnName : columnNames) {
            if ("row_number".equals(columnName))
                return i;
            i++;
        }

        throw new IllegalArgumentException("Source file does not contain 'row_number' column");
    }
}

  • 輸入文件: data.csv〜432 Mb
  • 輸出文件: res.csv〜868 Mb
  • 控制台輸出:經過的時間:21058 ms

您的輸出文件大於輸入文件。 當代碼針對70萬行運行時,可能會生成一個巨大的文件,編輯者很難打開該文件。

當您將其運行700k行時,/ Users / csv-parser / Output.csv的大小是多少?

您是否嘗試使用代碼而不是使用編輯器打開/Users/csv-parser/Output.csv的內容?

您是否嘗試將其放在另一個線程/任務中並運行它? 也許它阻塞了主線程...

看看這個: https//winterbe.com/posts/2015/04/07/java8-concurrency-tutorial-thread-executor-examples/

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM