
How to download all rows of data from a website using BeautifulSoup

I'd like to get some information from this weather site: https://pogoda.interia.pl/archiwum-pogody-08-10-2019,cId,21295

The hour and minutes, stored separately:

<div class="entry-hour">
        <span><span class="hour">0</span><span class="minutes">00</span></span>
    </div>

Forecast temp:

<span class="forecast-temp">9°C</span>

And FeelTemp:

<span class="forecast-feeltemp">Odczuwalna 4°C </span>

I'm stuck because I don't know how to get all the rows and the rest of the data :( Thank you in advance for your help...

Below is my pseudocode ;)

#!/usr/bin/python3
import pymysql.cursors
from time import sleep, gmtime, strftime
import datetime
import pytz
from selenium import webdriver
from bs4 import BeautifulSoup


options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')



browser = webdriver.Chrome(
    "/usr/bin/chromedriver",
    chrome_options=options)

browser.get("https://pogoda.interia.pl/archiwum-pogody-08-10-2019,cId,21295")
sleep(3)
source = browser.page_source  # get the entire page source from the browser
browser.close()  # the browser is no longer needed, so close it
soup = BeautifulSoup(source, 'html.parser')
try:
    tags = soup.select('.weather-forecast-hbh-list')  # get the elements using CSS selectors
    for tag in tags:  # loop through them
        hour = tag.find('div').find('span').text
        # minutes = ?
        # temp = ?
        # feel_temp = ?
        print(hour + "\n")

except Exception as e:
    print(e)

One way of doing it would be to loop over all the divs with class weather-entry and then extract the text out of each one, building a table structure along the way.

For example:

import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

page = requests.get('https://pogoda.interia.pl/archiwum-pogody-08-10-2019,cId,21295').content
weather_entries = BeautifulSoup(page, "html.parser").find_all("div", {"class": "weather-entry"})


def extract_text(element, class_name):
    return element.find("div", class_=class_name).getText(strip=True)


div_classes = [
    "entry-hour",
    "entry-forecast",
    "entry-wind",
    "entry-precipitation",
    "entry-humidity",
]

table = [[extract_text(e, c) for c in div_classes] for e in weather_entries]
columns = ["Time:", "Forecast", "Wind", "Precipitation", "Humidity"]
print(tabulate(table, headers=columns, tablefmt="pretty"))

This outputs:

+-------+---------------------------------------+----------------------+---------------+----------+
| Time: |               Forecast                |         Wind         | Precipitation | Humidity |
+-------+---------------------------------------+----------------------+---------------+----------+
|  000  |     -2°COdczuwalna 0°CBezchmurnie     |   S4km/hMax 4 km/h   |               |   97%    |
|  100  |    -2°COdczuwalna -1°CBezchmurnie     |   S4km/hMax 7 km/h   |   Zachm:10%   |   98%    |
|  200  |    -2°COdczuwalna -1°CBezchmurnie     |  SSW4km/hMax 8 km/h  |               |   98%    |
|  300  |    -2°COdczuwalna -1°CBezchmurnie     |   S4km/hMax 7 km/h   |               |   98%    |
|  400  |     -2°COdczuwalna 1°CBezchmurnie     |   N0km/hMax 7 km/h   |               |   93%    |
|  500  |     -2°COdczuwalna 1°CBezchmurnie     |   N0km/hMax 6 km/h   |               |   99%    |
|  600  | -2°COdczuwalna -1°CZachmurzenie duże  |  SSW4km/hMax 6 km/h  |   Zachm:76%   |   92%    |
|  700  |  -1°COdczuwalna 3°CZachmurzenie duże  |   N0km/hMax 7 km/h   |   Zachm:76%   |   84%    |
|  800  |     -3°COdczuwalna -1°CPochmurno      |  SSW4km/hMax 8 km/h  |   Zachm:91%   |   99%    |
|  900  |      3°COdczuwalna 5°CPochmurno       |  SSW4km/hMax 8 km/h  |   Zachm:91%   |   79%    |
| 1000  |      5°COdczuwalna 4°CPochmurno       |  S11km/hMax 11 km/h  |   Zachm:91%   |   71%    |
| 1100  |      6°COdczuwalna 5°CPochmurno       | SSW11km/hMax 20 km/h |  Zachm:100%   |   65%    |
| 1200  |      9°COdczuwalna 7°CPochmurno       |  S15km/hMax 25 km/h  |  Zachm:100%   |   66%    |
| 1300  |   10°COdczuwalna 8°CPrzelotne opady   |  S15km/hMax 25 km/h  |  Zachm:100%   |   60%    |
| 1400  |      11°COdczuwalna 8°CPochmurno      |  S18km/hMax 24 km/h  |  Zachm:100%   |   55%    |
| 1500  |      10°COdczuwalna 6°CPochmurno      |  S22km/hMax 27 km/h  |   Zachm:91%   |   57%    |
| 1600  |      10°COdczuwalna 6°CPochmurno      |  S22km/hMax 31 km/h  |   Zachm:91%   |   60%    |
| 1700  |   12°COdczuwalna 8°CPrzelotne opady   |  S18km/hMax 32 km/h  |  Zachm:100%   |   53%    |
| 1800  | 9°COdczuwalna 4°CCzęściowo słonecznie |  S18km/hMax 33 km/h  |   Zachm:50%   |   66%    |
| 1900  |      8°COdczuwalna 4°CPochmurno       |  S15km/hMax 31 km/h  |  Zachm:100%   |   82%    |
| 2000  |      8°COdczuwalna 4°CPochmurno       |  S18km/hMax 22 km/h  |   Zachm:91%   |   82%    |
| 2100  |   9°COdczuwalna 5°CPrzelotne opady    | SSW18km/hMax 22 km/h |  Zachm:100%   |   78%    |
| 2200  |      8°COdczuwalna 4°CPochmurno       | SSW15km/hMax 28 km/h |  Zachm:100%   |   80%    |
| 2300  |   8°COdczuwalna 5°CPrzelotne opady    | SSW11km/hMax 25 km/h |   Zachm:91%   |   81%    |
+-------+---------------------------------------+----------------------+---------------+----------+

Obviously, you'll still need to do a bit of parsing on the text values, but that should get you started.
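
For instance, here is a minimal sketch (not part of the original answer; the regex and helper name are assumptions) of how the concatenated forecast text could be split back into temperature, feel temperature, and description:

import re

# Hypothetical helper: split a string like "-2°COdczuwalna 0°CBezchmurnie"
# into its three parts. The pattern assumes the page always uses the
# "<temp>°COdczuwalna <feel temp>°C<description>" layout shown above.
FORECAST_RE = re.compile(r"(-?\d+)°COdczuwalna\s*(-?\d+)°C(.*)")

def parse_forecast(text):
    match = FORECAST_RE.match(text)
    if match is None:
        return None  # empty cell or the layout has changed
    temp, feel_temp, description = match.groups()
    return int(temp), int(feel_temp), description

print(parse_forecast("-2°COdczuwalna 0°CBezchmurnie"))  # (-2, 0, 'Bezchmurnie')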

Thanks, my friend, now I understand it ;) I have to get all the entries first and then extract the values from each one in a loop ;)

#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup

page = requests.get('https://pogoda.interia.pl/archiwum-pogody-08-10-2019,cId,21295').content
weather_entries = BeautifulSoup(page, "html.parser").find_all("div", {"class": "weather-entry"})
for weather_entry in weather_entries:
    hour = weather_entry.find('span', {'class': 'hour'}).text
    minutes = weather_entry.find('span', {'class': 'minutes'}).text
    temp = weather_entry.find('span', {'class': 'forecast-temp'}).text
    tempFeel = weather_entry.find('span', {'class': 'forecast-feeltemp'}).text
    print(hour + ":" + minutes + " \t " + temp + " \t " + tempFeel)
    

I don't have much experience with BeautifulSoup, but the same can be achieved with Selenium itself using XPath. The code below can be used to extract the required details.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

browser = webdriver.Chrome(
    "/usr/bin/chromedriver",
    chrome_options=options)

browser.get("https://pogoda.interia.pl/archiwum-pogody-08-10-2019,cId,21295")
WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, "//div[@class='entry-hour']")))
weather_entry = browser.find_elements_by_xpath("//div[@class='weather-entry']")
for w in weather_entry:
    hour = w.find_element_by_xpath(".//div[@class='entry-hour']/span/span[@class='hour']").text
    temp = w.find_element_by_xpath(".//div[@class='entry-forecast']/div//span[@class='temp-info']/span[@class='forecast-temp']").text
    feeltemp = w.find_element_by_xpath(".//div[@class='entry-forecast']/div//span[@class='temp-info']/span[@class='forecast-feeltemp']").text
    print('hour '+ hour + ' temp ' + temp + ' feeltemp ' + feeltemp)
