简体   繁体   中英

Attempting to Parse an XLS (XML) File Using Python

I have an "XLS" file that I downloaded from Netsuite ERP. The file root says ".XLS" but it's actually an XML file. I have a pandas script that will combine several XLS or XLSX files but pandas can't seem to handle this weird XLS/XML file type, so I have another script attempting to parse the XML data and save to XLS or XLSX. The below script doesn't seem to work however, as it results in "None". Could anyone point me in the right direction, either with my sample code, new code, or a new method for solving this odd XLS/XML parsing issue?

Thank you in advance!

XML Sample Code:

<?xml version="1.0" encoding="utf-16"?>
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:html="http://www.w3.org/TR/REC-html40">
  <DocumentProperties xmlns="urn:schemas-microsoft-com:office:office">
    <Author>NetSuite Reports</Author>
    <LastAuthor>NetSuite Reports</LastAuthor>
    <Company>NetSuite</Company>
  </DocumentProperties>
  <Styles>
    <Style ss:ID="company">
      <Alignment ss:Horizontal="Center" />
      <Font ss:Size="12" ss:Bold="1" />
    </Style>
    <Style ss:ID="subcompany">
      <Alignment ss:Horizontal="Center" />
      <Font ss:Size="14" ss:Bold="1" />
    </Style>
    <Style ss:ID="error">
      <Alignment ss:Horizontal="Center" />
      <Interior ss:Color="#f0d0d0" ss:Pattern="Solid" />
      <Font ss:Bold="1" />
    </Style>
    <Style ss:ID="header_l">
      <Alignment ss:Horizontal="Left" />
      <Font ss:Size="7" ss:Bold="1" />
      <Interior ss:Color="#d0d0d0" ss:Pattern="Solid" />
    </Style>
    <Style ss:ID="header_r">
      <Alignment ss:Horizontal="Right" />
      <Font ss:Size="7" ss:Bold="1" />
      <Interior ss:Color="#d0d0d0" ss:Pattern="Solid" />
    </Style>
    <Style ss:ID="header_c">
      <Alignment ss:Horizontal="Center" />
      <Font ss:Size="7" ss:Bold="1" />
      <Interior ss:Color="#d0d0d0" ss:Pattern="Solid" />
    </Style>
    <Style ss:ID="scheckbox">
      <Alignment ss:Vertical="Center" ss:Horizontal="Center" />
    </Style>
    <Style ss:ID="Default" ss:Name="Normal">
      <Alignment ss:Vertical="Bottom" />
      <Borders />
      <Font ss:FontName="Arial" ss:Size="8" />
      <Interior />
      <NumberFormat />
      <Protection />
    </Style>
    <Style ss:ID="s53">
      <Alignment ss:Vertical="Center" ss:Horizontal="Left" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
    <Style ss:ID="s52">
      <Alignment ss:Horizontal="Left" ss:Indent="1" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="0" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s51">
      <Alignment ss:Vertical="Center" ss:Horizontal="Right" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="0" ss:Italic="0" />
      <NumberFormat ss:Format="&quot;€&quot;#,##0.00" />
      <Borders />
    </Style>
    <Style ss:ID="s50">
      <Alignment ss:Vertical="Center" ss:Horizontal="Left" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s58">
      <Alignment ss:Horizontal="Left" ss:Indent="2" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
    <Style ss:ID="s54">
      <Alignment ss:Vertical="Center" ss:Horizontal="Right" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <NumberFormat ss:Format="&quot;€&quot;#,##0.00" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
    <Style ss:ID="s59">
      <Alignment ss:Horizontal="Left" ss:Indent="1" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
    <Style ss:ID="s56">
      <Alignment ss:Horizontal="Left" ss:Indent="2" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s57">
      <Alignment ss:Horizontal="Left" ss:Indent="3" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="0" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s55">
      <Alignment ss:Horizontal="Left" ss:Indent="1" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s60">
      <Alignment ss:Vertical="Center" ss:Horizontal="Left" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
  </Styles>
  <Worksheet ss:Name="TrialBalance">
    <Table>
      <Row>
        <Cell ss:StyleID="company" ss:MergeAcross="1">
          <Data ss:Type="String">Parent Company</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="company" ss:MergeAcross="1">
          <Data ss:Type="String">Company Holdings Inc. : Company A  B.V.</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="subcompany" ss:MergeAcross="1">
          <Data ss:Type="String">Trial Balance</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="subcompany" ss:MergeAcross="1">
          <Data ss:Type="String">End of Feb 2020</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="subcompany" ss:MergeAcross="1">
          <Data ss:Type="String" />
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="subcompany" ss:MergeAcross="1">
          <Data ss:Type="String" />
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="header_l">
          <Data ss:Type="String">Account</Data>
        </Cell>
        <Cell ss:StyleID="header_r" ss:MergeDown="0" ss:Index="2">
          <Data ss:Type="String">Total</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="s50">
          <Data ss:Type="String">10000 - CASH &amp; CASH EQUIVALENTS</Data>
        </Cell>
        <Cell ss:StyleID="s51" />
      </Row>
      <Row>
        <Cell ss:StyleID="s52">
          <Data ss:Type="String">10101 - Bank - 9999 - Company A - EUR</Data>
        </Cell>
        <Cell ss:StyleID="s51">
          <Data ss:Type="Number">1234567.01</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="s53">
          <Data ss:Type="String">Total - 10000 - CASH &amp; CASH EQUIVALENTS</Data>
        </Cell>
        <Cell ss:Formula="SUM(R[-1]C)" ss:StyleID="s54">
          <Data ss:Type="Number">1234567.01</Data>
        </Cell>
      </Row>
    </Table>
  </Worksheet>
</Workbook>

Python Code Parsing XML to XLS:

import pandas as pd
import xml.etree.cElementTree as ET

tree = ET.parse(r"C:\Users\NAME\Documents\rootfolder\examplefile.xls")
root = tree.getroot()

def getvalueofnode(node):
    """ return node text or None """
    return node.text if node is not None else None


def main():
    """ main """
    parsed_xml = tree
    dfcols = ['account', 'total']
    df_xml = pd.DataFrame(columns=dfcols)


for node in parsed_xml.getroot():
    account = node.attrib.get('Type="String"')
    total = node.find('Type="Number"')

    df_xml = df_xml.append(
        pd.Series([account, getvalueofnode(total)], index=dfcols),
        ignore_index=True)

print(df_xml)


main()

Python Parsing XML File Results:

  account total
0    None  None

Avoid building a data frame by appending objects like Series or even DataFrames. Instead, build a list of dictionaries to be binded to DataFrame . Additionally, because your XML has a default namespace, you must assign a prefix to parse any element under the namespace

import pandas as pd
import xml.etree.cElementTree as ET

ns = {"doc": "urn:schemas-microsoft-com:office:spreadsheet"}

tree = ET.parse(r"C:\Path\To\Input.xml")
root = tree.getroot()

def getvalueofnode(node):
    """ return node text or None """
    return node.text if node is not None else None


def main():
    """ main """
    parsed_xml = tree

    data = []
    for i, node in enumerate(root.findall('.//doc:Row', ns)):
        if i > 6:
            data.append({'account': getvalueofnode(node.find('doc:Cell[1]/doc:Data', ns)),
                         'total': getvalueofnode(node.find('doc:Cell[2]/doc:Data', ns))})

    return(pd.DataFrame(data))

output_df = main()

print(output_df)
#                                    account       total
# 0          10000 - CASH & CASH EQUIVALENTS        None
# 1    10101 - Bank - 9999 - Company A - EUR  1234567.01
# 2  Total - 10000 - CASH & CASH EQUIVALENTS  1234567.01

Alternatively, save your Excel styled XML as xlsx with Workbook.SaveAs method using win32com (only for Windows users) and read in with pandas.read_excel skipping appropriate rows.

import win32com.client
import pandas as pd

# SAVE EXCEL FILE
try:
    xlApp = win32com.client.Dispatch("Excel.Application")
    xlWbk = xlApp.Workbooks.Open(r"C:\Path\To\Input.xml")
    xlWbk.SaveAs(r"C:\Path\To\Output.xlsx", 51)

    xlWbk.Close(True)
    xlApp.Quit()

except Exception as e:
    print(e)

finally:
    xlWbk = None; xlApp = None
    del xlWbk; del xlApp

# READ EXCEL FILE
output_df = pd.read_excel(r"C:\Path\To\Output.xlsx", skiprows = 6)

print(output_df)    
#                                    Account       Total
# 0          10000 - CASH & CASH EQUIVALENTS         NaN
# 1    10101 - Bank - 9999 - Company A - EUR  1234567.01
# 2  Total - 10000 - CASH & CASH EQUIVALENTS  1234567.01

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM