[英]How to read/parse an .xls file in Python (XML schema)
如何在 python 中读取此结构?
<?xml version="1.0" encoding="ISO-8859-1"?>
<!-- SpreadsheetML workbook sample with a single worksheet "MSFRE".
     Cells are positioned by the ss:Index attribute (1-based column number);
     the data rows below carry no cell for column 7 (REF). -->
<Workbook xmlns:html="http://www.w3.org/TR/REC-html40"
          xmlns:x="urn:schemas-microsoft-com:office:excel"
          xmlns:msxsl="urn:schemas-microsoft-com:xslt"
          xmlns:o="urn:schemas-microsoft-com:office:office"
          xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xmlns="urn:schemas-microsoft-com:office:spreadsheet">
  <!-- Named cell styles referenced by ss:StyleID on the cells below. -->
  <Styles>
    <Style ss:ID="VIEW">
      <Font ss:Bold="1"/>
    </Style>
    <Style ss:ID="HEADER">
      <Font ss:Bold="1" ss:Color="#FFFFFF"/>
      <Interior ss:Color="#666699" ss:Pattern="Solid"/>
    </Style>
    <Style ss:ID="DOUBLE_0">
      <NumberFormat ss:Format="0"/>
    </Style>
    <Style ss:ID="DOUBLE_2">
      <NumberFormat ss:Format="0.00"/>
    </Style>
    <Style ss:ID="DOUBLE_3">
      <NumberFormat ss:Format="0.000"/>
    </Style>
    <Style ss:ID="DOUBLE_4">
      <NumberFormat ss:Format="0.0000"/>
    </Style>
    <Style ss:ID="PERCENT_FIXED_2">
      <NumberFormat ss:Format="0.00%"/>
    </Style>
    <Style ss:ID="PERCENT_FIXED_3">
      <NumberFormat ss:Format="0.000%"/>
    </Style>
    <Style ss:ID="PERCENT_FIXED_4">
      <NumberFormat ss:Format="0.0000%"/>
    </Style>
    <Style ss:ID="PERCENT_FIXED_5">
      <NumberFormat ss:Format="0.00000%"/>
    </Style>
    <Style ss:ID="DATE">
      <NumberFormat ss:Format="yyyy\-mm\-dd;@"/>
    </Style>
    <ss:Style ss:ID="STRING"/>
  </Styles>
  <Worksheet ss:Name="MSFRE">
    <Table x:FullRows="1" x:FullColumns="1">
      <!-- Row 1: view title. -->
      <Row>
        <Cell ss:StyleID="VIEW" ss:Index="1">
          <Data ss:Type="String">Geo</Data>
        </Cell>
      </Row>
      <!-- Row 2: empty group header merged across columns 3 to 8. -->
      <Row>
        <Cell ss:StyleID="HEADER"/>
        <Cell ss:StyleID="HEADER"/>
        <Cell ss:StyleID="HEADER" ss:Index="3" ss:MergeAcross="5">
          <Data ss:Type="String"/>
        </Cell>
      </Row>
      <!-- Row 3: column headers for columns 3 to 8. -->
      <Row>
        <Cell ss:StyleID="HEADER"/>
        <Cell ss:StyleID="HEADER"/>
        <Cell ss:StyleID="HEADER" ss:Index="3">
          <Data ss:Type="String">Holding Date</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="4">
          <Data ss:Type="String">Fund code</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="5">
          <Data ss:Type="String">Fund name</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="6">
          <Data ss:Type="String">PTF</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="7">
          <Data ss:Type="String">REF</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="8">
          <Data ss:Type="String">PTF-REF</Data>
        </Cell>
      </Row>
      <!-- Row 4: top-level "Geo" line; its label sits in column 1. -->
      <Row>
        <Cell ss:Index="1">
          <Data ss:Type="String">Geo</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.18545250736645816</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.18545250736645816</Data>
        </Cell>
      </Row>
      <!-- Rows 5 onward: one line per region; the label sits in column 2. -->
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">Asia ex-Japan</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">9.356235001537855E-4</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">9.356235001537855E-4</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">EMU</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.10654090959320628</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.10654090959320628</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">Emerging Countries</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.00294017805163712</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.00294017805163712</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">Europe ex-EMU</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.02354783768818136</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.02354783768818136</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">Japan</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.005898729959204227</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.005898729959204227</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">North America</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.044037874185699856</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.044037874185699856</Data>
        </Cell>
      </Row>
    </Table>
  </Worksheet>
</Workbook>
如果我使用下面的代码,单元格的位置索引会错位,无法正确重建表格:
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and keeps the same "ET" alias used throughout this script.
import xml.etree.ElementTree as ET

import pandas as pd

#dict_bd_area =
# Prefix map for ElementTree path lookups: "doc" names the SpreadsheetML
# namespace that is the default namespace of the Workbook document.
ns = {"doc": "urn:schemas-microsoft-com:office:spreadsheet"}
# Parse the workbook once at import time; "filepath" is a placeholder path.
tree = ET.parse("filepath")
root = tree.getroot()
def getvalueofnode(node):
    """Return the text of ``node``, or None when ``node`` is None.

    Lets the result of ``Element.find()`` be passed straight in without the
    caller checking whether the lookup matched anything.

    Args:
        node: an element (any object with a ``text`` attribute) or None.

    Returns:
        ``node.text`` (which may itself be None for an empty element), or
        None if ``node`` is None.
    """
    return node.text if node is not None else None
def _cells_by_column(row):
    """Map 1-based sheet column number -> Data text for each Cell of ``row``."""
    attr_prefix = "{%s}" % ns["doc"]
    values = {}
    column = 0
    for cell in row.findall('doc:Cell', ns):
        explicit = cell.get(attr_prefix + "Index")
        # ss:Index is optional; without it the cell follows the previous one.
        column = int(explicit) if explicit is not None else column + 1
        values[column] = getvalueofnode(cell.find('doc:Data', ns))
        # A merged cell also occupies the ss:MergeAcross columns to its right.
        column += int(cell.get(attr_prefix + "MergeAcross", 0))
    return values


def main():
    """Build a DataFrame from the worksheet rows after the first four.

    Values are looked up by sheet column number (the ``ss:Index`` attribute)
    rather than by the position of the Cell element inside its Row, so a row
    that omits a cell (e.g. column 7, REF) yields None for that column instead
    of shifting every later value one column to the left.

    Returns:
        pandas.DataFrame with the columns 'Geo', 'Holding - Date',
        'Fund Code', 'Name', 'PTF', 'REF', 'PTF-REF', 'SPS PTF', 'SPS REF';
        missing cells are None.
    """
    # Output column name -> sheet column number, following the header row
    # (Holding Date = 3 ... PTF-REF = 8); region labels sit in column 2.
    # NOTE(review): 'SPS PTF' / 'SPS REF' are assumed to be columns 9 and 10;
    # they do not appear in the sample workbook - confirm against a real file.
    columns = {
        'Geo': 2,
        'Holding - Date': 3,
        'Fund Code': 4,
        'Name': 5,
        'PTF': 6,
        'REF': 7,
        'PTF-REF': 8,
        'SPS PTF': 9,
        'SPS REF': 10,
    }
    data = []
    for i, node in enumerate(root.findall('.//doc:Row', ns)):
        # Rows 0-3 (title, two header rows and the top-level line) are skipped.
        if i > 3:
            cells = _cells_by_column(node)
            data.append({name: cells.get(col) for name, col in columns.items()})
    return pd.DataFrame(data)
如果单元格索引为空或 null 在这种情况下如何处理? (在这种情况下,单元格索引 7 并不总是存在)。 在这种情况下,Win32 库无法帮助我,我尝试过,但它没有处理数千个文件所需的性能
更新:2
如果我想解析 parentColDimension 元素该怎么办? 这是 XML 文件的开头:
<loadingDate>2021-08-10T00:00:00+02:00</loadingDate>
<exportDate>2021-08-11T18:43:22.513+02:00</exportDate>
<exportParameters key="occurrenceKey" value="Family_occurrence"/>
<exportParameters key="occurrenceName" value="Family occurrence"/>
<exportParameters key="ConfigurationPath" value="/usr/users/reporting/configurations"/>
<exportParameters key="scenarioKey" value="mult"/>
<exportParameters key="perspective" value="Mulnt"/>
<dataViews viewHashcode="-75742857" showRootLine="true">
<viewKey>Flat</viewKey>
<viewName>Flat</viewName>
<viewType>tableSectionViewType</viewType>
<sectionInfos>
<sectionInfo sectionKey="MultiView" showSectionLine="false" startLine="0"/>
</sectionInfos>
<viewId>3</viewId>
<headers>
<header id="1" key="GF 350006|" desc="&lt;html&gt;&lt;font color=#00A0E3&gt;&lt;b&gt;Weight% (PTF)" groupName="GF 350006" Key="WEIGHT#PTF" dataType="Number" dataFormat="PERCENT_FIXED_2">
<parentColDimension label="GF 350006"/>
</header>
<header id="2" key="GF TOP PRIVATE 350007" groupName="GF 350007" Key="WEIGHT#PTF" dataType="Number" dataFormat="PERCENT_FIXED_2">
<parentColDimension label="GF 350007"/>
尝试按单元格的 ss:Index 属性来查找,而不是按 Cell 元素在行中的位置:
# Bind an "ss" prefix in addition to "doc"; both name the same SpreadsheetML
# namespace, and "ss" is what qualifies the Index attribute in the path below.
ns = {
    "doc": "urn:schemas-microsoft-com:office:spreadsheet",
    "ss": "urn:schemas-microsoft-com:office:spreadsheet",
}
# Select the Cell whose ss:Index attribute equals "7" and read its Data child;
# getvalueofnode() gives None when the row has no such cell.
getvalueofnode(node.find('doc:Cell[@ss:Index="7"]/doc:Data', ns))
同样的方法也可以用于其他单元格。
这段代码会按给定的 Index 属性值查找 Cell 元素。如果没有找到,函数 getvalueofnode() 将返回 None。
要获取 parentColDimension,可以使用以下代码:
# Print the "label" attribute of every parentColDimension element that is a
# direct child of a header element, anywhere below the root.
# NOTE(review): the "doc" prefix resolves to the SpreadsheetML namespace; the
# XML fragment in the question shows no namespace on these elements - confirm
# the xmlns declared on that file's root, otherwise this matches nothing.
for parentColDimension in root.findall('.//doc:header/doc:parentColDimension', ns):
    print(parentColDimension.get('label'))
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.