简体   繁体   中英

Parse XML to pandas using elementTree and python

I have the following xml structure:

<GL_MarketDocument
    xmlns="urn:iec62325.351:tc57wg16:451-6:generationloaddocument:3:0">
    <mRID>352539b33d6245f88c0cea8c70c86e76</mRID>
    <revisionNumber>1</revisionNumber>
    <type>A75</type>
    <process.processType>A16</process.processType>
    <sender_MarketParticipant.mRID codingScheme="A01">10X1001A1001A450</sender_MarketParticipant.mRID>
    <sender_MarketParticipant.marketRole.type>A32</sender_MarketParticipant.marketRole.type>
    <receiver_MarketParticipant.mRID codingScheme="A01">10X1001A1001A450</receiver_MarketParticipant.mRID>
    <receiver_MarketParticipant.marketRole.type>A33</receiver_MarketParticipant.marketRole.type>
    <createdDateTime>2023-01-11T11:37:08Z</createdDateTime>
    <time_Period.timeInterval>
        <start>2023-01-10T23:00Z</start>
        <end>2023-01-11T11:00Z</end>
    </time_Period.timeInterval>
    <TimeSeries>
        <mRID>1</mRID>
        <businessType>A01</businessType>
        <objectAggregation>A08</objectAggregation>
        <inBiddingZone_Domain.mRID codingScheme="A01">10Y1001A1001A46L</inBiddingZone_Domain.mRID>
        <quantity_Measure_Unit.name>MAW</quantity_Measure_Unit.name>
        <curveType>A01</curveType>
        <MktPSRType>
            <psrType>B04</psrType>
        </MktPSRType>
        <Period>
            <timeInterval>
                <start>2023-01-10T23:00Z</start>
                <end>2023-01-11T10:00Z</end>
            </timeInterval>
            <resolution>PT60M</resolution>
            <Point>
                <position>1</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>2</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>3</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>4</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>5</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>6</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>7</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>8</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>9</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>10</position>
                <quantity>0</quantity>
            </Point>
            <Point>
                <position>11</position>
                <quantity>0</quantity>
            </Point>
        </Period>
    </TimeSeries>
    <TimeSeries>
        <mRID>2</mRID>
        <businessType>A01</businessType>
        <objectAggregation>A08</objectAggregation>
        <inBiddingZone_Domain.mRID codingScheme="A01">10Y1001A1001A46L</inBiddingZone_Domain.mRID>
        <quantity_Measure_Unit.name>MAW</quantity_Measure_Unit.name>
        <curveType>A01</curveType>
        <MktPSRType>
            <psrType>B12</psrType>
        </MktPSRType>
        <Period>
            <timeInterval>
                <start>2023-01-10T23:00Z</start>
                <end>2023-01-11T10:00Z</end>
            </timeInterval>
            <resolution>PT60M</resolution>
            <Point>
                <position>1</position>
                <quantity>841</quantity>
            </Point>
            <Point>
                <position>2</position>
                <quantity>821</quantity>
            </Point>
            <Point>
                <position>3</position>
                <quantity>809</quantity>
            </Point>
            <Point>
                <position>4</position>
                <quantity>803</quantity>
            </Point>
            <Point>
                <position>5</position>
                <quantity>800</quantity>
            </Point>
            <Point>
                <position>6</position>
                <quantity>799</quantity>
            </Point>
            <Point>
                <position>7</position>
                <quantity>884</quantity>
            </Point>
            <Point>
                <position>8</position>
                <quantity>963</quantity>
            </Point>
            <Point>
                <position>9</position>
                <quantity>1012</quantity>
            </Point>
            <Point>
                <position>10</position>
                <quantity>1021</quantity>
            </Point>
            <Point>
                <position>11</position>
                <quantity>1006</quantity>
            </Point>
        </Period>
    </TimeSeries>

and I am tryng to get this: 在此处输入图像描述

I am able to get the the tags separately using this:

response = requests.get(base_url)
root = ET.fromstring(response.content) #get the xml content as text

#Manage namespaces
text = root.tag #get the namespace from root tag
get_ns = text[text.index('{')+len('{'):text.index('}')] #grab the text between the curly brackets
#Register the name space
ET.register_namespace("", get_ns)
#Save the namespace/S in a dict so we dont have to specify them in the loop
ns = {"": get_ns}

#for child in root.iter(): print(child.tag, child.attrib) #iterate through all the nodes

#find all the tags
psc_type = root.findall(".//TimeSeries/MktPSRType/psrType", ns)
pos = root.findall(".//TimeSeries/Period/Point/position", ns)
qty = root.findall(".//TimeSeries/Period/Point/quantity", ns)


#nitiate a list for rows and define column names for pandas
df_cols = ["Type", "TimeOfDay", "Quantity"]
rows1 = []
rows = []
for psc in psc_type:
    p_type = psc.text
    rows1.append(psc.text)
for hour, qt in zip( pos, qty):
        hour =  hour.text, 
        qty = qt.text
        period = [hour[0], qty]
        #hour comes out as a tuple, so we need to get first value out hour[0]
        rows.append(period)
        x = [rows1, rows]

that returns two lists, that I guess I can put together in pandas:

['B04', 'B12', 'B14', 'B20', 'B16', 'B19']
[['1', '0'], ['2', '0'], ['3', '0'], ['4', '0'], ['5', '0'], ['6', '0'], ['7', '0'], ['8', '0'], ['9', '0'], ['10', '0'], ['11', '0'], ['12', '0'], ['1', '841'], ['2', '821'], ['3', '809'], ['4', '803'], ['5', '800'], ['6', '799'], ['7', '884'], ['8', '963'], ['9', '1012'], ['10', '1021'], ['11', '1006'], ['12', '1011'], ['1', '5793'], ['2', '5794'], ['3', '5795'], ['4', '5794'], ['5', '5794'], ['6', '5794'], ['7', '5794'], ['8', '5795'], ['9', '5792'], ['10', '5790'], ['11', '5791'], ['12', '5794'], ['1', '667'], ['2', '657'], ['3', '651'], ['4', '666'], ['5', '675'], ['6', '706'], ['7', '743'], ['8', '775'], ['9', '784'], ['10', '792'], ['11', '837'], ['12', '856'], ['1', '0'], ['2', '0'], ['3', '0'], ['4', '0'], ['5', '0'], ['6', '0'], ['7', '0'], ['8', '0'], ['9', '0'], ['10', '0'], ['11', '2'], ['12', '3'], ['1', '1984'], ['2', '2164'], ['3', '2310'], ['4', '2497'], ['5', '2669'], ['6', '2786'], ['7', '2884'], ['8', '2927'], ['9', '2913'], ['10', '2873'], ['11', '2813'], ['12', '2740']]

But it seems too complicated. My guess is that ElementTree can parse that and maybe even pandas with the new XML read but I just cant figure it out.

Where am I going wrong?

While it's possible to do what you want using ElementTree, I personally prefer lxml, because of its better xpath support - which is what you need here.

This assumes that response.content is exactly like the xml you have in the question, except with a closing </GL_MarketDocument> tag (which you ommitted in the question) added back.

So all together:

from lxml import etree
import pandas as pd
root = etree.XML(response.content)

rows = []
columns = ['psrTYpe','Position','Quantity']
ns = {'xx': 'urn:iec62325.351:tc57wg16:451-6:generationloaddocument:3:0'}
per = doc.xpath('//xx:Period',namespaces=ns)
for p in per:
    mkt = p.xpath('.//preceding-sibling::xx:MktPSRType/xx:psrType/text()',namespaces=ns)[0]
    pt = p.xpath('.//xx:Point/xx:position/text()',namespaces=ns)
    qn = p.xpath('.//xx:Point/xx:quantity/text()',namespaces=ns)
    for position, quantity in zip(pt,qn):
        rows.append([mkt,position,quantity])
        
df = pd.DataFrame(rows, columns=columns)
df

The output should be your expected output.

I managed to solve it, so in case it helps others:

Basically I needed to loop from the root of both nodes and then append on a list.

root = ET.fromstring(response.content) #get the xml content as text

#Manage namespaces
text = root.tag #get the namespace from root tag
get_ns = text[text.index('{')+len('{'):text.index('}')] #grab the text between the curly brackets
#Register the name space
ET.register_namespace("", get_ns)
#Save the namespace/S in a dict so we dont have to specify them in the loop
ns = {"": get_ns}

#for child in root.iter(): print(child.tag, child.attrib) #iterate through all the nodes

get_values = []
rows = []
for parent in root.findall(".//TimeSeries" , ns):
    psr_type = parent.find("MktPSRType/psrType", ns)
    for pos, qty in zip(parent.findall("Period/Point/position", ns), parent.findall("Period/Point/quantity", ns)):
        position = pos.text
        quantity = qty.text
        get_values = [psr_type.text,position, quantity]
        rows.append(get_values)

xml_todf = pd.DataFrame(rows, columns=['Source','TimeOfDay', 'Quantity'])

print(xml_todf.to_string())

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM