![](/img/trans.png)
[英]Extracting data from xml with similar tag name using beautiful soup
[英]Extracting data Frame from XML file using Beautiful soup
我試圖使用 Beautiful Soup 將 xml 表轉換為數據框。
import bs4 as bs
import urllib.request
import pandas as pd
# Download the IESO generator output/capability report and parse it as XML.
source = urllib.request.urlopen("http://reports.ieso.ca/public/GenOutputCapability/PUB_GenOutputCapability.xml").read()
soup = bs.BeautifulSoup(source, 'xml')

# Each find_all() call returns one flat, document-wide list for that tag name.
GName = soup.find_all('GeneratorName')
Ftype = soup.find_all('FuelType')
Hour = soup.find_all('Hour')
Mwatt = soup.find_all('EnergyMW')

# One row per generator name, built from the i-th entry of every list.
# NOTE(review): the lists are indexed in lockstep, but nothing guarantees that
# the i-th Hour/EnergyMW element belongs to the i-th generator; if a generator
# holds several Hour/EnergyMW elements the rows will be misaligned -- this
# looks like the problem the question describes.
data = [
    [
        GName[i].get_text(),
        Ftype[i].get_text(),
        Hour[i].get_text(),
        Mwatt[i].get_text(),
    ]
    for i in range(len(GName))
]

df = pd.DataFrame(
    data,
    columns=['Generator Name', 'Fuel Type', 'Hour', 'Energy MW'],
    dtype=int,
)
display(df)
Generator Name Fuel Type Hour Energy MW
0 BRUCEA-G1 NUCLEAR 1 777
1 BRUCEA-G2 NUCLEAR 2 777
2 BRUCEA-G3 NUCLEAR 3 777
3 BRUCEA-G4 NUCLEAR 4 778
4 BRUCEB-G5 NUCLEAR 5 780
... ... ... ... ...
175 STONE MILLS SF SOLAR 8 0
176 WINDSOR AIRPORT SF SOLAR 9 0
177 ATIKOKAN-G1 BIOFUEL 10 0
178 CALSTOCKGS BIOFUEL 11 0
179 TBAYBOWATER CTS BIOFUEL 12 0
180 rows × 4 columns
最終數據框僅給出索引 0 的能量 MW。 它應該適用於所有 180 個站。 我卡住了。 謝謝
這是一個結構相當粗糙的 XML,需要花一些工夫才能轉換成類似頁面上顯示的表格:
#first, some required imports
import itertools
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
# Fetch the report and parse it with the lenient 'lxml' HTML parser.
# NOTE: the nested selectors below are written in lower case ('generatorname',
# 'energymw') to match how that parser exposes the tag names.
url = 'http://reports.ieso.ca/public/GenOutputCapability/PUB_GenOutputCapability.xml'
req = requests.get(url)
soup = bs(req.text, 'lxml')
gens = soup.select('Generator')

# Every generator contributes two rows, one per series, in this order.
series = ("Capability", "Output")

# Build the two MultiIndex levels from however many generators the report
# actually contains, instead of assuming a fixed count.
gen_names = [gen.select_one('generatorname').text for gen in gens]
generator_level = [name for name in gen_names for _ in series]
series_level = [kind for _ in gen_names for kind in series]
index = pd.MultiIndex.from_arrays(
    [generator_level, series_level], names=["Generator", "Hours"]
)

# Collect the hourly values: for each generator, one list per series.
data = [
    [node.text for node in gen.select(f'{kind} energymw')]
    for gen in gens
    for kind in series
]

# Column names are the hours 1..N. The number of hours present depends on the
# time of day the report is downloaded, so take it from the data rather than
# hard-coding it.
hour_count = max((len(values) for values in data), default=0)
cols = list(range(1, hour_count + 1))

pd.DataFrame(data, index=index, columns=cols)
Output(請原諒格式):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
Generator Hours
BRUCEA-G1 Capability 795 795 795 795 795 795 795 795 795 795 795 795 795 795 795 795 795 795 795
Output 776 775 775 774 775 775 774 774 774 775 776 777 777 775 774 773 773 774 774
BRUCEA-G2 Capability 779 779 779 779 779 779 779 779 779 779 779 779 779 779 779 779 779 779 779
(以下省略。)
一種方法是使用報表所指定的 XSLT 轉換樣式表,透過 etree.XSLT 套用轉換來產生表格;接著用 pandas 的 read_html 選取該表格,然後視需要對標題列做一些整理。
from lxml import etree
from pandas import read_html as rh
# Stylesheet and report locations.
stylesheet_url = 'http://reports.ieso.ca/docrefs/stylesheet/GenOutputCapability_HTML_t1-4.xsl'
report_url = 'http://reports.ieso.ca/public/GenOutputCapability/PUB_GenOutputCapability.xml'

# Compile the XSLT stylesheet, then apply it to the XML report.
stylesheet_doc = etree.parse(stylesheet_url)
transform = etree.XSLT(stylesheet_doc)
report_doc = etree.parse(report_url)
result_tree = transform(report_doc)

# Take the first table in the transformed output whose text matches 'Hours'.
tables = rh(str(result_tree), match='Hours')
df = tables[0]

# Use the second row as the column labels and keep the rows from the third on.
df.columns = df.iloc[1]
df = df.iloc[2:]
df
在此處閱讀有關轉換步驟的信息: https://lxml.de/xpathxslt.html#xslt
from lxml.html import parse
import pandas as pd
def main(url):
    """Parse the report at *url* and print one DataFrame row per generator.

    Each child of the ``generators`` element is read by position: children 0
    and 1 supply the text for 'GeneratorName' and 'FuelType'; children 2, 3
    and 4 each supply a list of their ``EnergyMW`` texts.
    """
    generators = parse(url).find('.//generators')

    def energy_values(section):
        # Text of every EnergyMW element found under this section.
        return [node.text for node in section.cssselect('EnergyMW')]

    records = [
        {
            'GeneratorName': gen[0].text,
            'FuelType': gen[1].text,
            'Outputs': energy_values(gen[2]),
            'Capabilities': energy_values(gen[3]),
            'capacities': energy_values(gen[4]),
        }
        for gen in generators
    ]
    df = pd.DataFrame(records)
    print(df)


main('http://reports.ieso.ca/public/GenOutputCapability/PUB_GenOutputCapability.xml')
Output:
GeneratorName ... capacities
0 BRUCEA-G1 ... [795, 795, 795, 795, 795, 795, 795, 795, 795, ...
1 BRUCEA-G2 ... [779, 779, 779, 779, 779, 779, 779, 779, 779, ...
2 BRUCEA-G3 ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3 BRUCEA-G4 ... [760, 760, 760, 760, 760, 760, 760, 760, 760, ...
4 BRUCEB-G5 ... [817, 817, 817, 817, 817, 817, 817, 817, 817, ...
.. ... ... ...
175 STONE MILLS SF ... [54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 5...
176 WINDSOR AIRPORT SF ... [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 5...
177 ATIKOKAN-G1 ... [215, 215, 215, 215, 215, 215, 215, 215, 215, ...
178 CALSTOCKGS ... [38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
179 TBAYBOWATER CTS ... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...
[180 rows x 5 columns]
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.