簡體   English   中英

使用 Beautiful soup 從 XML 文件中提取數據幀

[英]Extracting a DataFrame from an XML file using Beautiful Soup

我試圖使用 Beautiful Soup 將 xml 表轉換為數據框。

import bs4 as bs
import urllib.request
import pandas as pd
# Download the raw XML report and parse it with Beautiful Soup's XML parser.
source = urllib.request.urlopen("http://reports.ieso.ca/public/GenOutputCapability/PUB_GenOutputCapability.xml").read()
soup = bs.BeautifulSoup(source,'xml')

# find_all() collects every matching tag in the whole document, so each of
# these is one flat list spanning all generators.
# NOTE(review): Hour and EnergyMW presumably occur many times under each
# generator (once per hour), while GeneratorName and FuelType occur once.
# If so, the four lists are NOT aligned index-for-index -- confirm against
# the XML.
GName = soup.find_all('GeneratorName')
Ftype = soup.find_all('FuelType')
Hour = soup.find_all('Hour')
Mwatt = soup.find_all('EnergyMW')

# Build one row per generator by taking the i-th entry of every list.
# NOTE(review): because Hour/Mwatt are flat document-wide lists, Hour[i] and
# Mwatt[i] are simply the i-th values in the document (i.e. they come from
# the first generator's hourly series), not values belonging to GName[i].
# This positional pairing is the likely cause of the problem described in
# the question.
data = []
for i in range(0,len(GName)):
   rows = [GName[i].get_text(),Ftype[i].get_text(),
           Hour [i].get_text(),Mwatt[i].get_text()
           ]
   data.append(rows)
# Assemble the rows into a four-column frame; dtype=int is requested for the
# whole frame even though the first two columns hold text.
df = pd.DataFrame(data,columns = ['Generator Name','Fuel Type',
                                  'Hour','Energy MW'],
                                   dtype = int)
# `display` is not defined in this snippet; presumably the IPython/Jupyter
# display helper -- TODO confirm the execution environment.
display(df)
    Generator Name  Fuel Type   Hour    Energy MW
0   BRUCEA-G1   NUCLEAR     1   777
1   BRUCEA-G2   NUCLEAR     2   777
2   BRUCEA-G3   NUCLEAR     3   777
3   BRUCEA-G4   NUCLEAR     4   778
4   BRUCEB-G5   NUCLEAR     5   780
...     ...     ...     ...     ...
175     STONE MILLS SF  SOLAR   8   0
176     WINDSOR AIRPORT SF  SOLAR   9   0
177     ATIKOKAN-G1     BIOFUEL     10  0
178     CALSTOCKGS  BIOFUEL     11  0
179     TBAYBOWATER CTS     BIOFUEL     12  0

180 rows × 4 columns

最終的數據框只給出了索引 0(第一台發電機)的 Energy MW 數值,但它應該涵蓋全部 180 台發電機。 我卡住了。 謝謝

這份 XML 的結構相當粗糙,需要花一些工夫才能把它轉換成類似網頁上所顯示的表格:

#first, some required imports
import itertools
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

url = 'http://reports.ieso.ca/public/GenOutputCapability/PUB_GenOutputCapability.xml'

# Fail fast on HTTP errors or a stalled connection instead of silently
# parsing an error page or hanging forever.
req = requests.get(url, timeout=30)
req.raise_for_status()
# The 'lxml' (HTML) parser lower-cases tag names, which is why the selectors
# below use lower-case names such as 'generatorname' and 'energymw'.
soup = bs(req.text,'lxml')

gens = soup.select('Generator')

# Every generator contributes two rows, in this order. Both the row labels
# and the data rows are derived from this single list so that they cannot
# drift out of step with each other.
row_kinds = ["Capability", "Output"]

# Repeat each generator name once per row kind. The generator count is taken
# from the document itself rather than assumed to be 180.
gen_names = [
    gen.select_one('generatorname').text
    for gen in gens
    for _ in row_kinds
]
row_labels = row_kinds * len(gens)

# Two-level row index: (generator name, Capability/Output).
index = pd.MultiIndex.from_arrays([gen_names, row_labels], names=["Generator", "Hours"])

# Collect the hourly values: one list of EnergyMW texts per generator and
# row kind, in the same order as the index built above.
data = [
    [cell.text for cell in gen.select(f'{kind} energymw')]
    for gen in gens
    for kind in row_kinds
]

# The number of hourly columns depends on the time of day the report is
# downloaded, so let pandas size the frame from the data and then label the
# columns 1..N instead of hard-coding the hour range.
df = pd.DataFrame(data, index=index)
cols = list(range(1, df.shape[1] + 1))
df.columns = cols
df

Output(請原諒格式):

                        1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18  19
Generator   Hours                                                                           
BRUCEA-G1   Capability  795     795     795     795     795     795     795     795     795     795     795     795     795     795     795     795     795     795     795
Output  776     775     775     774     775     775     774     774     774     775     776     777     777     775     774     773     773     774     774
BRUCEA-G2   Capability  779     779     779     779     779     779     779     779     779     779     779     779     779     779     779     779     779     779     779

ETC。

一種方法是使用 XML 文件中指定的轉換樣式表,透過 etree.XSLT 套用該轉換來生成表格。 接著用 pandas 的 read_html 選取該表格,然後根據需要對標題列做一些整理。

from lxml import etree
from pandas import read_html as rh

# Local import: pandas deprecated passing literal HTML strings to read_html
# (since 2.1), so the transformed markup is wrapped in a file-like object.
from io import StringIO

# Build the XSLT transformer from the stylesheet, then apply it to the XML
# report to obtain the rendered HTML tree.
transform = etree.XSLT(etree.parse('http://reports.ieso.ca/docrefs/stylesheet/GenOutputCapability_HTML_t1-4.xsl'))
result_tree = transform(etree.parse('http://reports.ieso.ca/public/GenOutputCapability/PUB_GenOutputCapability.xml'))

# Keep the first table whose text matches 'Hours'.
df = rh(StringIO(str(result_tree)), match = 'Hours')[0]

# Row 1 of the parsed table is used as the header; the data starts at row 2.
df.columns = df.iloc[1, :]
df = df.iloc[2:, ]
df

在此處閱讀有關轉換步驟的信息: https://lxml.de/xpathxslt.html#xslt

from lxml.html import parse
import pandas as pd


def main(url):
    """Parse the report at ``url`` and print one DataFrame row per generator.

    The document is parsed with ``lxml.html.parse``; the first ``generators``
    element is located and each of its children is turned into a record.
    Children are read by position.
    NOTE(review): this assumes each generator's children are ordered as
    name, fuel type, outputs, capabilities, capacities -- confirm against
    the XML.

    Nothing is returned; the assembled frame is printed.
    """
    generators = parse(url).find('.//generators')

    def energy_values(section):
        # Text of every EnergyMW element found under ``section``.
        return [node.text for node in section.cssselect('EnergyMW')]

    records = [
        {
            'GeneratorName': gen[0].text,
            'FuelType': gen[1].text,
            'Outputs': energy_values(gen[2]),
            'Capabilities': energy_values(gen[3]),
            'capacities': energy_values(gen[4]),
        }
        for gen in generators
    ]
    df = pd.DataFrame(records)
    print(df)


# Run the extraction against the published report URL; main() prints the frame.
main('http://reports.ieso.ca/public/GenOutputCapability/PUB_GenOutputCapability.xml')

Output:

          GeneratorName  ...                                         capacities
0             BRUCEA-G1  ...  [795, 795, 795, 795, 795, 795, 795, 795, 795, ...
1             BRUCEA-G2  ...  [779, 779, 779, 779, 779, 779, 779, 779, 779, ...
2             BRUCEA-G3  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3             BRUCEA-G4  ...  [760, 760, 760, 760, 760, 760, 760, 760, 760, ...
4             BRUCEB-G5  ...  [817, 817, 817, 817, 817, 817, 817, 817, 817, ...
..                  ...  ...                                                ...
175      STONE MILLS SF  ...  [54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 5...
176  WINDSOR AIRPORT SF  ...  [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 5...
177         ATIKOKAN-G1  ...  [215, 215, 215, 215, 215, 215, 215, 215, 215, ...
178          CALSTOCKGS  ...  [38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
179     TBAYBOWATER CTS  ...  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...

[180 rows x 5 columns]

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM