简体   繁体   中英

How to read/parse an .xls file in Python (XML schema)

how to read this structure in python?

<?xml version="1.0" encoding="ISO-8859-1"?>

-<Workbook xmlns:html="http://www.w3.org/TR/REC-html40" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:msxsl="urn:schemas-microsoft-com:xslt" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:schemas-microsoft-com:office:spreadsheet">


-<Styles>


-<Style ss:ID="VIEW">

<Font ss:Bold="1"/>

</Style>


-<Style ss:ID="HEADER">

<Font ss:Bold="1" ss:Color="#FFFFFF"/>

<Interior ss:Color="#666699" ss:Pattern="Solid"/>

</Style>


-<Style ss:ID="DOUBLE_0">

<NumberFormat ss:Format="0"/>

</Style>


-<Style ss:ID="DOUBLE_2">

<NumberFormat ss:Format="0.00"/>

</Style>


-<Style ss:ID="DOUBLE_3">

<NumberFormat ss:Format="0.000"/>

</Style>


-<Style ss:ID="DOUBLE_4">

<NumberFormat ss:Format="0.0000"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_2">

<NumberFormat ss:Format="0.00%"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_3">

<NumberFormat ss:Format="0.000%"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_4">

<NumberFormat ss:Format="0.0000%"/>

</Style>


-<Style ss:ID="PERCENT_FIXED_5">

<NumberFormat ss:Format="0.00000%"/>

</Style>


-<Style ss:ID="DATE">

<NumberFormat ss:Format="yyyy\-mm\-dd;@"/>

</Style>

<ss:Style ss:ID="STRING"/>

</Styles>


-<Worksheet ss:Name="MSFRE">


-<Table x:FullRows="1" x:FullColumns="1">


-<Row>


-<Cell ss:StyleID="VIEW" ss:Index="1">

<Data ss:Type="String">Geo</Data>

</Cell>

</Row>


-<Row>

<Cell ss:StyleID="HEADER"/>

<Cell ss:StyleID="HEADER"/>


-<Cell ss:StyleID="HEADER" ss:Index="3" ss:MergeAcross="5">

<Data ss:Type="String"/>

</Cell>

</Row>


-<Row>

<Cell ss:StyleID="HEADER"/>

<Cell ss:StyleID="HEADER"/>


-<Cell ss:StyleID="HEADER" ss:Index="3">

<Data ss:Type="String">Holding Date</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="4">

<Data ss:Type="String">Fund code</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="5">

<Data ss:Type="String">Fund name</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="6">

<Data ss:Type="String">PTF</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="7">

<Data ss:Type="String">REF</Data>

</Cell>


-<Cell ss:StyleID="HEADER" ss:Index="8">

<Data ss:Type="String">PTF-REF</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="1">

<Data ss:Type="String">Geo</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.18545250736645816</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.18545250736645816</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Asia ex-Japan</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">9.356235001537855E-4</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">9.356235001537855E-4</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">EMU</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.10654090959320628</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.10654090959320628</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Emerging Countries</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.00294017805163712</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.00294017805163712</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Europe ex-EMU</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.02354783768818136</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.02354783768818136</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">Japan</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.005898729959204227</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.005898729959204227</Data>

</Cell>

</Row>


-<Row>


-<Cell ss:Index="2">

<Data ss:Type="String">North America</Data>

</Cell>


-<Cell ss:StyleID="DATE" ss:Index="3">

<Data ss:Type="DateTime">2020-09-10T00:00:00</Data>

</Cell>


-<Cell ss:Index="4">

<Data ss:Type="String">PF39594</Data>

</Cell>


-<Cell ss:Index="5">

<Data ss:Type="String">MSFRE</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">

<Data ss:Type="Number">0.044037874185699856</Data>

</Cell>


-<Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">

<Data ss:Type="Number">0.044037874185699856</Data>

</Cell>

</Row>


</Table>

</Worksheet>

</Workbook>

if I use this code the cell index collapse and can't recreate the table:

import pandas as pd
import xml.etree.cElementTree as ET

#dict_bd_area = 
ns = {"doc": "urn:schemas-microsoft-com:office:spreadsheet"}

tree = ET.parse("filepath")
root = tree.getroot()

def getvalueofnode(node):
    """ return node text or None """
    return node.text if node is not None else None


def main():
    """ main """
    parsed_xml = tree

    data = []
    for i, node in enumerate(root.findall('.//doc:Row', ns)):
        if i > 3:
            data.append({'Geo': getvalueofnode(node.find('doc:Cell[1]/doc:Data', ns)),
                         'Holding - Date': getvalueofnode(node.find('doc:Cell[2]/doc:Data', ns)),
                         'Fund Code': getvalueofnode(node.find('doc:Cell[3]/doc:Data', ns)),
                         'Name': getvalueofnode(node.find('doc:Cell[4]/doc:Data', ns)), 
                         'PTF': getvalueofnode(node.find('doc:Cell[5]/doc:Data', ns)),
                         'REF': getvalueofnode(node.find('doc:Cell[6]/doc:Data', ns)),
                         'PTF-REF':getvalueofnode(node.find('doc:Cell[7]/doc:Data', ns)),
                         'SPS PTF': getvalueofnode(node.find('doc:Cell[8]/doc:Data', ns)),
                         'SPS REF': getvalueofnode(node.find('doc:Cell[9]/doc:Data', ns))
                        })

return(pd.DataFrame(data))

How to handle the cell index if is empty or null as in this case? (in this case cell index 7 doesn't always exists). The Win32 library cannot help me in this case, I tried but it didn't have performance required to handle thousands of file

UPDATE: 2

What if I want to parse the parentcolDimension? This is the beginning of the XML file:

<loadingDate>2021-08-10T00:00:00+02:00</loadingDate>
<exportDate>2021-08-11T18:43:22.513+02:00</exportDate>
<exportParameters key="occurrenceKey" value="Family_occurrence"/>
<exportParameters key="occurrenceName" value="Family occurrence"/>
<exportParameters key="ConfigurationPath" value="/usr/users/reporting/configurations"/>
<exportParameters key="scenarioKey" value="mult"/>
<exportParameters key="perspective" value="Mulnt"/>
<dataViews viewHashcode="-75742857" showRootLine="true">
<viewKey>Flat</viewKey>
<viewName>Flat</viewName>
<viewType>tableSectionViewType</viewType>
<sectionInfos>
<sectionInfo sectionKey="MultiView" showSectionLine="false" startLine="0"/>
</sectionInfos>
<viewId>3</viewId>
<headers>
<header id="1" key="GF 350006|" desc="<html><font color=#00A0E3><b>Weight% (PTF) groupName="GF 350006" Key="WEIGHT#PTF" dataType="Number" dataFormat="PERCENT_FIXED_2">
<parentColDimension label="GF 350006"/>
</header>
<header id="2" key="GF TOP PRIVATE 350007 groupName="GF 350007"Key="WEIGHT#PTF" dataType="Number" dataFormat="PERCENT_FIXED_2">
<parentColDimension label="GF 350007"/>

Try to use "Index" attribute instead of cell element index:

# add "ss" namespace declaration to the namespaces map
ns = {"doc": "urn:schemas-microsoft-com:office:spreadsheet", "ss": "urn:schemas-microsoft-com:office:spreadsheet"}

# in function call reference element "Cell" having an attribute "Index" with value "7"
getvalueofnode(node.find('doc:Cell[@ss:Index="7"]/doc:Data', ns))

Same approach could be used for other cells as well.

This code will try to find cell element with the given index attribute. If not found function getvalueofnode() will return None.

To get parentColDimension following code could be used:

for parentColDimension in root.findall('.//doc:header/doc:parentColDimension', ns):
  print(parentColDimension.get('label'))

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM