[英]Read column from a web page table
我只想从NASA网页的表格中读取“每日太阳辐射-水平”这一列,应该怎么做?这是我的代码:
# Horizontal radiation values (kwh/m**2)
import urllib.parse
import html5lib
import pandas as pd
# Base query URL for the NASA endpoint; its lat/lon values are overridden below.
url = "https://eosweb.larc.nasa.gov/cgi-bin/sse/retscreen.cgi?email=rets%40nrcan.gc.ca&step=1&lat=49.4&lon=7.3&submit=Submit"
# NOTE(review): `a` and `b` are not defined anywhere in this snippet; they are
# presumably the desired latitude and longitude and must be set beforehand.
params = {'lat':a,'lon':b}
# urlparse yields (scheme, netloc, path, params, query, fragment); index 4 is
# the query string. It is converted to a list so that component can be replaced.
url_parts = list(urllib.parse.urlparse(url))
# Decode the query string into a dict, override lat/lon, then re-encode it.
query = dict(urllib.parse.parse_qsl(url_parts[4]))
query.update(params)
url_parts[4] = urllib.parse.urlencode(query)
# Show the final URL that will be requested.
print(urllib.parse.urlunparse(url_parts))
# read_html returns a list of DataFrames, one per <table> it finds on the page.
webresult = pd.read_html(urllib.parse.urlunparse(url_parts))
# Bare expression selecting the 4th table (the whole table, not a single
# column); its value is only echoed in an interactive session or notebook.
webresult[3]
但这样只能显示完整的表格,而不是我想要的那一列。
使用BeautifulSoup可以很容易地做到这一点,具体说明见代码中的注释。
import bs4, requests
def getColumn(url):
    """Return the 4th column of the 4th table on the page at *url* as floats.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup
    using the ``lxml`` parser. For every row of the 4th ``<table>`` that has
    at least four ``<td>`` cells, the text of the 4th cell is collected. The
    first collected value and the last two are then discarded, and the rest
    are converted to ``float``.

    Args:
        url: Address of the page to download.

    Returns:
        A list of floats taken from the selected column.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
        IndexError: The page contains fewer than four tables.
        ValueError: A retained cell does not contain a valid number.
    """
    # Get the page. Without a timeout requests can block forever, and without
    # the status check an HTTP error page would be parsed as if it were data.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    # Create a BeautifulSoup object that represents the page, using the lxml
    # parser to parse the html.
    soup = bs4.BeautifulSoup(resp.text, 'lxml')

    # We only want to process the 4th table on the page.
    tables = soup.find_all('table')
    if len(tables) < 4:
        raise IndexError(
            'expected at least 4 tables on the page, found %d' % len(tables)
        )
    table = tables[3]

    # All data of interest is collected here.
    data = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Skip header-only rows (no <td>) and rows too short to have a 4th
        # cell, instead of failing with IndexError on them.
        if len(cells) < 4:
            continue
        # get_text() always returns a str; .string is None whenever the cell
        # holds nested markup or no content, which would break strip() below.
        data.append(cells[3].get_text())

    # The data is text, so convert it to float. Discard the first and the
    # last two collected elements (we don't want them) before converting.
    return [float(x.strip()) for x in data[1:-2]]
def main():
    """Fetch the column for the sample NASA query and print the resulting list."""
    sample_url = 'https://eosweb.larc.nasa.gov/cgi-bin/sse/retscreen.cgi?email=rets%40nrcan.gc.ca&step=1&lat=49.4&lon=7.3&submit=Submit'
    print(getColumn(sample_url))


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.