简体   繁体   中英

Apache POI Streaming API doesn't recognize Excel (xlsx) content

I have a class which ingests .xlsx-files. I took it from this example and modified it for my needs: https://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java Now the application processes some files just fine, others not at all. If I change one single field or even character in one of the not working files and save them again, the whole content is processed correctly. Does anyone have an idea what might be the reason for (imho it lies somewhere within the original excel files).

To whom it may help, here is my code:

    package com.goodgamestudios.icosphere.service.fileReader;

    import com.goodgamestudios.icosphere.datamodel.DataSet;
    import com.goodgamestudios.icosphere.datamodel.Tuple;
    import java.io.File;
    import java.io.IOException;
    import java.io.InputStream;
    import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
    import org.apache.poi.openxml4j.opc.OPCPackage;
    import org.apache.poi.ss.usermodel.BuiltinFormats;
    import org.apache.poi.ss.usermodel.DataFormatter;
    import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
    import org.apache.poi.xssf.eventusermodel.XSSFReader;
    import org.apache.poi.xssf.model.SharedStringsTable;
    import org.apache.poi.xssf.model.StylesTable;
    import org.apache.poi.xssf.usermodel.XSSFCellStyle;
    import org.apache.poi.xssf.usermodel.XSSFRichTextString;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.xml.sax.Attributes;
    import org.xml.sax.InputSource;
    import org.xml.sax.SAXException;
    import org.xml.sax.XMLReader;
     import org.xml.sax.helpers.DefaultHandler;
    import org.xml.sax.helpers.XMLReaderFactory;


    public class ExcelFileReader implements FileReader {

    static final Logger LOG = LoggerFactory.getLogger(ExcelFileReader.class);
    private SheetHandler handler;

    @Override
    public DataSet getDataFromFile(File file) throws IOException {

        LOG.info("Start ingesting file {}");
        try {
            OPCPackage pkg = OPCPackage.open(file);
            XSSFReader reader = new XSSFReader(pkg);
            StylesTable styles = reader.getStylesTable();
            ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(pkg);

            SharedStringsTable sst = reader.getSharedStringsTable();
            XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
            handler = new SheetHandler(styles, strings, 24);
            parser.setContentHandler(handler);

            // rId2 found by processing the Workbook
            // Seems to either be rId# or rSheet#
            System.out.println("yooooo 1");
            InputStream sheet2 = reader.getSheet("rId2");
            System.out.println("yooooo 2");
            InputSource sheetSource = new InputSource(sheet2);
            System.out.println("yooooo 3");
            parser.parse(sheetSource);
            LOG.debug("{} rows parsed", handler.getData().getRows().size() + 1);
            sheet2.close();
            return handler.getData();

        } catch (OpenXML4JException | SAXException ex) {
            LOG.warn("Unable to parse file {}", file.getName());
            LOG.warn("Exception: {}: ", ex);
        }

        return null;
    }

    /**
     * See org.xml.sax.helpers.DefaultHandler javadocs
     *
     * Derived from http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api
     * <p/>
     * Also see Standard ECMA-376, 1st edition, part 4, pages 1928ff, at
     * http://www.ecma-international.org/publications/standards/Ecma-376.htm
     * <p/>
     * A web-friendly version is http://openiso.org/Ecma/376/Part4
     */
    private static class SheetHandler extends DefaultHandler {

        boolean isFirstRow = true;
        private int quantityOfColumns;
        private int currentColumnNumber = 1;
        int currentRowNumber = 1;
        private int rowNumberOfLastCell = 1;
        private DataSet data = new DataSet();
        private Tuple tuple;

        /**
         * Table with styles
         */
        private StylesTable stylesTable;

        /**
         * Table with unique strings
         */
        private ReadOnlySharedStringsTable sharedStringsTable;

        /**
         * Number of columns to read starting with leftmost
         */
        private final int minColumnCount;

        // Set when V start element is seen
        private boolean vIsOpen;

        // Set when cell start element is seen;
        // used when cell close element is seen.
        private xssfDataType nextDataType;

        // Used to format numeric cell values.
        private short formatIndex;
        private String formatString;
        private final DataFormatter formatter;

        // The last column printed to the output stream
        private int lastColumnNumber = -1;

        // Gathers characters as they are seen.
        private StringBuffer value;

        static final Logger LOG = LoggerFactory.getLogger(SheetHandler.class);

        private SheetHandler(StylesTable styles,
                ReadOnlySharedStringsTable strings,
                int cols) {
            this.stylesTable = styles;
            this.sharedStringsTable = strings;
            this.minColumnCount = cols;
            this.value = new StringBuffer();
            this.nextDataType = xssfDataType.NUMBER;
            this.formatter = new DataFormatter();
            LOG.debug("Sheethandler created");
        }

        /*
         * (non-Javadoc)
         * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
         */
        public void startElement(String uri, String localName, String name,
                Attributes attributes) throws SAXException {
            System.out.println("yooooooooooo start:uri:" + uri + " localname: " + localName + " name: " + name);
            if ("inlineStr".equals(name) || "v".equals(name)) {
                vIsOpen = true;
                // Clear contents cache
                value.setLength(0);
            } // c => cell
            else if ("c".equals(name)) {
                // Get the cell reference
                String r = attributes.getValue("r");
                int firstDigit = -1;
                for (int c = 0; c < r.length(); ++c) {
                    if (Character.isDigit(r.charAt(c))) {
                        firstDigit = c;
                        break;
                    }
                }
                currentColumnNumber = nameToColumn(r.substring(0, firstDigit));
                System.out.println("colu mn " + currentColumnNumber);

                // Set up defaults.
                this.nextDataType = xssfDataType.NUMBER;
                this.formatIndex = -1;
                this.formatString = null;
                String cellType = attributes.getValue("t");
                String cellStyleStr = attributes.getValue("s");
                if ("b".equals(cellType)) {
                    nextDataType = xssfDataType.BOOL;
                } else if ("e".equals(cellType)) {
                    nextDataType = xssfDataType.ERROR;
                } else if ("inlineStr".equals(cellType)) {
                    nextDataType = xssfDataType.INLINESTR;
                } else if ("s".equals(cellType)) {
                    nextDataType = xssfDataType.SSTINDEX;
                } else if ("str".equals(cellType)) {
                    nextDataType = xssfDataType.FORMULA;
                } else if (cellStyleStr != null) {
                    // It's a number, but almost certainly one
                    //  with a special style or format 
                    XSSFCellStyle style = null;
                    if (cellStyleStr != null) {
                        int styleIndex = Integer.parseInt(cellStyleStr);
                        style = stylesTable.getStyleAt(styleIndex);
                    } else if (stylesTable.getNumCellStyles() > 0) {
                        style = stylesTable.getStyleAt(0);
                    }
                    if (style != null) {
                        this.formatIndex = style.getDataFormat();
                        this.formatString = style.getDataFormatString();
                        if (this.formatString == null) {
                            this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex);
                        }
                    }
                }
            }

        }

        /*
         * (non-Javadoc)
         * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
         */
        public void endElement(String uri, String localName, String name)
                throws SAXException {

            String thisStr = null;

            // v => contents of a cell
            if ("v".equals(name)) {
                // Process the value contents as required.
                // Do now, as characters() may be called more than once
                switch (nextDataType) {

                    case BOOL:
                        char first = value.charAt(0);
                        thisStr = first == '0' ? "FALSE" : "TRUE";
                        break;

                    case ERROR:
                        thisStr = "\"ERROR:" + value.toString() + '"';
                        break;

                    case FORMULA:
                        // A formula could result in a string value,
                        // so always add double-quote characters.
                        thisStr = '"' + value.toString() + '"';
                        break;

                    case INLINESTR:
                        // TODO: have seen an example of this, so it's untested.
                        XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
                        thisStr = '"' + rtsi.toString() + '"';
                        break;

                    case SSTINDEX:
                        String sstIndex = value.toString();
                        try {
                            int idx = Integer.parseInt(sstIndex);
                            XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
                            thisStr = rtss.toString();
                        } catch (NumberFormatException ex) {
                            System.out.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString());
                        }
                        break;

                    case NUMBER:
                        String n = value.toString();
                        if (this.formatString != null && n.length() > 0) {
                            thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString);
                        } else {
                            thisStr = n;
                        }
                        break;

                    default:
                        thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
                        break;
                }

                // Output after we've seen the string contents
                // Emit commas for any fields that were missing on this row
                if (lastColumnNumber == -1) {
                    lastColumnNumber = 0;
                }
                for (int i = lastColumnNumber; i < currentColumnNumber; ++i) {
                }

                // Might be the empty string.
                System.out.println(thisStr);
                if (isFirstRow) {
                    data.getHeaders().add(thisStr);
                } else {
                    tuple.getRowEntries()[currentColumnNumber] = thisStr;
                }
                // Update column
                if (currentColumnNumber > -1) {
                    lastColumnNumber = currentColumnNumber;
                }

            } else if ("row".equals(name)) {

                // We're onto a new row
                System.out.println("nextrow");
                lastColumnNumber = -1;
                System.out.println("yoooooo tuple:" + tuple);
                if (isFirstRow) {
                    isFirstRow = false;
                    quantityOfColumns = data.getHeaders().size();
                    tuple = new Tuple(quantityOfColumns);

                } else if (!tuple.isEmpty()) {
                    data.addRow(tuple);
                    tuple = new Tuple(quantityOfColumns);
                }
            }

        }

        /**
         * Captures characters only if a suitable element is open. Originally
         * was just "v"; extended for inlineStr also.
         */
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            if (vIsOpen) {
                value.append(ch, start, length);
            }
        }

        /**
         * Converts an Excel column name like "C" to a zero-based index.
         *
         * @param name
         * @return Index corresponding to the specified name
         */
        private int nameToColumn(String name) {
            int column = -1;
            for (int i = 0; i < name.length(); ++i) {
                int c = name.charAt(i);
                column = (column + 1) * 26 + c - 'A';
            }
            return column;
        }

        public DataSet getData() {
            return data;
        }
    }

    /**
     * The type of the data value is indicated by an attribute on the cell. The
     * value is usually in a "v" element within the cell.
     */
    enum xssfDataType {

        BOOL,
        ERROR,
        FORMULA,
        INLINESTR,
        SSTINDEX,
        NUMBER,
    }
}

Here is the xml example of a working and a not working worksheet:

http://www.file-upload.net/download-10909789/not_working.xml.html http://www.file-upload.net/download-10909790/working.xml.html

and here the xlsx-files:

http://www.file-upload.net/download-10909802/not_working.xlsx.html http://www.file-upload.net/download-10909803/working.xlsx.html

Thanks!

The problem was, that LibreOffice Calc saves the first worksheet under "rId2", whereas MSOffice does so under "rId1". So now I'm now going through sheetIds until a sheet with content is parsed or no more sheets are found. Works with both files:

private void parseFirstWorksheetWithContent(XSSFReader reader) throws IOException, InvalidFormatException, SAXException {
    //Sheet-ID seems to differ, seems to be "rId2" for files saved by MS Excel and "rId1" for those saved by LibreOffice Calc
    try {
        for (int i = 1; handler.getData().isEmpty(); i++) {
            parseSheet(reader, "rId" + i);
        }
    } catch (IllegalArgumentException e) {
        //No more sheets, file empty
    }
}

private void parseSheet(XSSFReader reader, String sheetId) throws InvalidFormatException, SAXException, IOException {
        XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        parser.setContentHandler(handler);
        InputStream sheetStream = reader.getSheet(sheetId);
        InputSource sheetSource = new InputSource(sheetStream);
        parser.parse(sheetSource);
        sheetStream.close();
    }

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM