简体   繁体   中英

Web scraping: looping over multiple dynamic pages using Python and BeautifulSoup

I want to web-scrape a predefined list of links. I want to scrape hiking tours from https://www.outdooractive.com/de/ in a specific area, so I defined the area with 20 links. So far so good. I get the data for one link, but when I try to loop through the list of pages, it only goes through one link. I hope it's just my incompetence in logical thinking. If somebody could help me, I would be very happy.

Here is my code, with just three of the links rather than all of them.

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
import codecs


# All tour-list URLs share one prefix and differ only in the value that
# follows "zc=14," (presumably the map centre coordinates - not verified here).
_TOUR_LIST_PREFIX = (
    "https://www.outdooractive.com/de/touren/"
    "#cat=Wanderung&view=listMap"
    "&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)"
    "&zc=14,"
)

# Pages to scrape, in the order they were originally listed.
webliste = [
    _TOUR_LIST_PREFIX + suffix
    for suffix in (
        "13.54301,48.94731",
        "13.5443,48.88763",
        "13.4589,48.93163",
    )
]


class Page(QWebEnginePage):
    """Load one URL in a QWebEnginePage and keep the resulting HTML.

    Constructing an instance blocks: ``__init__`` starts the load and runs
    the Qt event loop until ``Callable`` has received the page's HTML and
    quit the loop.  Afterwards the markup is available as ``self.html``.
    """

    def __init__(self, url):
        """Start loading *url* and block until its HTML has been captured.

        Args:
            url: Address to load, as a string accepted by ``QUrl``.
        """
        # Reuse an already existing application object: a process should
        # only ever own one QApplication, no matter how many pages it loads.
        self.app = QApplication.instance() or QApplication(sys.argv)
        super().__init__()
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls self.app.quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Request the loaded page's HTML; the result arrives in ``Callable``."""
        # toHtml() hands the markup to the callback instead of returning it,
        # so its return value must not be stored in self.html.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Store the delivered HTML and leave the event loop.

        Args:
            html_str: The page's HTML as passed in by ``toHtml``.
        """
        self.html = html_str
        self.app.quit()



# Build the page object from the list entry at index 1 only; constructing
# Page loads that URL.
page = Page(webliste[1])

# Output file for the scraped tours; main() writes the rows and closes it.
filename = "WandertourenLinks.csv"

# Built-in open() replaces the deprecated codecs.open().  newline="" disables
# newline translation, so "\n" is written unchanged on every platform, just as
# codecs.open() (which works in binary mode) did.
f = open(filename, "w", encoding="utf-8", newline="")

# Semicolon-separated header line; the data rows use the same column order.
headers = ";".join([
    "Tour Name",
    "Länge",
    "Zeit",
    "Aufstieg",
    "Abstieg",
    "Link zur Tour",
    "Anbieter",
]) + "\n"

f.write(headers)



def main():
    """Parse the HTML of the module-level ``page`` and write its tours to ``f``.

    Every ``div`` with class ``oax_dp_snippet`` is treated as one tour.  For
    each tour the name, the four ``oax_tour_data`` values (length, time,
    ascent, descent), the first link's ``href`` and the provider text are
    printed and written as one semicolon-separated row, in the same column
    order as the header line.  The module-level file ``f`` is closed when the
    function ends, also if parsing raises.

    Raises:
        IndexError: If a tour snippet lacks the name span, one of the four
            data fields, or the provider element.
    """
    import csv  # local import so this function does not rely on a file-level one

    # csv.writer quotes any field that itself contains ';' or a quote
    # character, which plain string concatenation would silently corrupt.
    # lineterminator="\n" matches the line ending of the header row.
    writer = csv.writer(f, delimiter=";", lineterminator="\n")

    try:
        soup = bs.BeautifulSoup(page.html, 'html.parser')
        containers = soup.find_all("div", {"class": "oax_dp_snippet"})
        print("Anzahl der gefundenen touren", len(containers))

        for container in containers:
            tour_name = container.find_all("span", {"dir": "auto"})[0].text
            print("Name der Tour: ", tour_name)

            tour_data = container.find_all("div", {"class": "oax_tour_data oax_fl"})
            leange = tour_data[0].text.strip()
            zeit = tour_data[1].text.strip()
            aufstieg = tour_data[2].text.strip()
            abstieg = tour_data[3].text.strip()

            # Each value gets its own label (named after the CSV columns).
            print("Länge der Tour: ", leange)
            print("Zeit der Tour: ", zeit)
            print("Aufstieg der Tour: ", aufstieg)
            print("Abstieg der Tour: ", abstieg)

            link = container.a["href"]
            print("Link zur Tour: ", link)

            tour_anbieter = container.find_all(
                "div",
                {"class": "oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"},
            )
            anbieter = tour_anbieter[0].text.strip()
            print("Tourenanbieter: ", anbieter)

            writer.writerow([tour_name, leange, zeit, aufstieg, abstieg, link, anbieter])
    finally:
        # Release the output file even when a snippet could not be parsed.
        f.close()


if __name__ == '__main__':
    main()

@Steve Haigh Thanks, the second tip you gave me was the best. Now everything works. I know it's not very sexy code, but it works ;)

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
import codecs


class Page(QWebEnginePage):
    """Load one URL in a QWebEnginePage and keep the resulting HTML.

    Constructing an instance blocks: ``__init__`` starts the load and runs
    the Qt event loop until ``Callable`` has received the page's HTML and
    quit the loop.  Afterwards the markup is available as ``self.html``.
    """

    def __init__(self, url):
        """Start loading *url* and block until its HTML has been captured.

        Args:
            url: Address to load, as a string accepted by ``QUrl``.
        """
        # Several Page objects are created in one run; reuse the existing
        # application object, because a process should only ever own one
        # QApplication.
        self.app = QApplication.instance() or QApplication(sys.argv)
        super().__init__()
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls self.app.quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Request the loaded page's HTML; the result arrives in ``Callable``."""
        # toHtml() hands the markup to the callback instead of returning it,
        # so its return value must not be stored in self.html.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Store the delivered HTML and leave the event loop.

        Args:
            html_str: The page's HTML as passed in by ``toHtml``.
        """
        self.html = html_str
        self.app.quit()

# All tour-list URLs share one prefix and differ only in the value that
# follows "zc=14," (presumably the map centre coordinates - not verified here).
_TOUR_LIST_PREFIX = (
    "https://www.outdooractive.com/de/touren/"
    "#cat=Wanderung&view=listMap"
    "&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)"
    "&zc=14,"
)

# Pages to scrape, in the order they were originally listed.
webliste = [
    _TOUR_LIST_PREFIX + suffix
    for suffix in (
        "13.32663,49.07201",
        "13.30002,49.0945",
        "13.22097,49.11664",
    )
]




# Output file for the scraped tours; main() writes the rows and closes it.
filename = "WandertourenLinks.csv"

# Built-in open() replaces the deprecated codecs.open().  newline="" disables
# newline translation, so "\n" is written unchanged on every platform, just as
# codecs.open() (which works in binary mode) did.
f = open(filename, "w", encoding="utf-8", newline="")

# Semicolon-separated header line; the data rows use the same column order.
headers = ";".join([
    "Tour Name",
    "Länge",
    "Zeit",
    "Aufstieg",
    "Abstieg",
    "Link zur Tour",
    "Anbieter",
]) + "\n"

f.write(headers)

def main():
    """Scrape every URL in ``webliste`` and write one CSV row per tour to ``f``.

    For each URL a ``Page`` is created and its HTML parsed with
    BeautifulSoup.  Every ``div`` with class ``oax_dp_snippet`` is treated as
    one tour: its name, the four ``oax_tour_data`` values (length, time,
    ascent, descent), the first link's ``href`` and the provider text are
    printed and written as one semicolon-separated row, in the same column
    order as the header line.  The module-level file ``f`` is closed when the
    function ends, also if scraping raises.

    Raises:
        IndexError: If a tour snippet lacks the name span, one of the four
            data fields, or the provider element.
    """
    import csv  # local import so this function does not rely on a file-level one

    # csv.writer quotes any field that itself contains ';' or a quote
    # character, which plain string concatenation would silently corrupt.
    # lineterminator="\n" matches the line ending of the header row.
    writer = csv.writer(f, delimiter=";", lineterminator="\n")

    try:
        # Iterate the list itself so adding or removing URLs needs no
        # matching change to a hard-coded count.
        for url in webliste:
            page = Page(url)
            soup = bs.BeautifulSoup(page.html, 'html.parser')
            containers = soup.find_all("div", {"class": "oax_dp_snippet"})
            print("Anzahl der gefundenen touren", len(containers))

            for container in containers:
                tour_name = container.find_all("span", {"dir": "auto"})[0].text
                print("Name der Tour: ", tour_name)

                tour_data = container.find_all("div", {"class": "oax_tour_data oax_fl"})
                leange = tour_data[0].text.strip()
                zeit = tour_data[1].text.strip()
                aufstieg = tour_data[2].text.strip()
                abstieg = tour_data[3].text.strip()

                # Each value gets its own label (named after the CSV columns).
                print("Länge der Tour: ", leange)
                print("Zeit der Tour: ", zeit)
                print("Aufstieg der Tour: ", aufstieg)
                print("Abstieg der Tour: ", abstieg)

                link = container.a["href"]
                print("Link zur Tour: ", link)

                tour_anbieter = container.find_all(
                    "div",
                    {"class": "oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"},
                )
                anbieter = tour_anbieter[0].text.strip()
                print("Tourenanbieter: ", anbieter)

                writer.writerow([tour_name, leange, zeit, aufstieg, abstieg, link, anbieter])
    finally:
        # Release the output file even when a page could not be parsed.
        f.close()


if __name__ == '__main__':
    main()

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM