从网页表中读取列

Question

我只想从 NASA 网页的表格中读取“每日太阳辐射-水平”这一列，应该怎么做？这是我的代码：

# Horizontal radiation values (kwh/m**2)
import urllib.parse
import html5lib
import pandas as pd

# Template request; the lat/lon values in its query string are replaced below.
url = "https://eosweb.larc.nasa.gov/cgi-bin/sse/retscreen.cgi?email=rets%40nrcan.gc.ca&step=1&lat=49.4&lon=7.3&submit=Submit"

# NOTE(review): `a` and `b` are not defined anywhere in this snippet; they
# must already be bound (presumably to latitude and longitude) before this
# line runs, otherwise it raises NameError.
params = {'lat': a, 'lon': b}

# urlparse() yields (scheme, netloc, path, params, query, fragment); a list
# copy is taken so the query component (index 4) can be reassigned.
url_parts = list(urllib.parse.urlparse(url))

# Decode the existing query string and let `params` override matching keys.
query = {**dict(urllib.parse.parse_qsl(url_parts[4])), **params}
url_parts[4] = urllib.parse.urlencode(query)

# Reassemble the URL once and reuse it for both the echo and the download.
request_url = urllib.parse.urlunparse(url_parts)
print(request_url)

# read_html() returns one DataFrame per table found on the page; index 3
# selects the fourth one. The bare expression only displays the whole table
# when run in an interactive session.
webresult = pd.read_html(request_url)
webresult[3]

但这样只能显示整张表，而不是我想要的那一列。

Answer 1

使用 BeautifulSoup 可以很容易地做到这一点，具体说明见代码中的注释。

import bs4, requests

def getColumn(url, table_index=3, column_index=3):
    """Fetch *url* and return one column of one HTML table as floats.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup
    using the ``lxml`` parser.  From the selected table, the text of the
    selected ``<td>`` cell is collected for every row that has one.  The
    first collected value and the last two are then discarded, and the rest
    are converted to ``float``.

    Args:
        url: Address of the page to download.
        table_index: Zero-based position of the table among all ``<table>``
            elements on the page.  Defaults to 3 (the fourth table).
        column_index: Zero-based position of the ``<td>`` cell to read in
            each row.  Defaults to 3 (the fourth column).

    Returns:
        A list of floats taken from the selected column.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page has no table at ``table_index``.
        ValueError: If a retained cell does not contain a number.
    """
    resp = requests.get(url)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    resp.raise_for_status()

    soup = bs4.BeautifulSoup(resp.text, 'lxml')
    table = soup.find_all('table')[table_index]

    data = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')

        # Skip rows without <td> cells (e.g. header rows made of <th>) and
        # rows too short to contain the requested column.
        if not cells or len(cells) <= column_index:
            continue

        # get_text() always returns a str, whereas .string is None when a
        # cell has more than one child node.
        data.append(cells[column_index].get_text())

    # The first collected entry and the last two are not wanted values, so
    # they are dropped before converting the remaining strings to float.
    return [float(text.strip()) for text in data[1:-2]]


def main():
    """Print the values that getColumn() extracts from the sample NASA page."""
    print(getColumn(
        'https://eosweb.larc.nasa.gov/cgi-bin/sse/retscreen.cgi?email=rets%40nrcan.gc.ca&step=1&lat=49.4&lon=7.3&submit=Submit'
    ))


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

从网页表中读取列

问题描述

1 个解决方案

解决方案1
2 已采纳 2017-09-07 15:01:06

从网页表中读取列

问题描述

1 个解决方案

解决方案1 2 已采纳 2017-09-07 15:01:06

解决方案1
2 已采纳 2017-09-07 15:01:06