
Web scraping: looping over multiple dynamic pages using Python and BeautifulSoup

I want to scrape a set of predefined links. Specifically, I want to collect hiking tours for a particular region from https://www.outdooractive.com/de/, so I defined that region through 20 links. So far, so good. I can get the data for a single link, but when I try to loop over the list of pages, it only works through one link. I hope this is just a gap in my logic. I would be very grateful if someone could help me.

Here is my code, with only three of the links rather than all of them.

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
import codecs


webliste = []
webliste.append('https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.54301,48.94731')
webliste.append("https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.5443,48.88763")
webliste.append("https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.4589,48.93163")


for Page in webliste:

        class Page(QWebEnginePage):
            def __init__(self, url):
                self.app = QApplication(sys.argv)
                QWebEnginePage.__init__(self)
                self.html = ''
                self.loadFinished.connect(self._on_load_finished)
                self.load(QUrl(url))
                self.app.exec_()

            def _on_load_finished(self):
                self.html = self.toHtml(self.Callable)
                print('Load finished')

            def Callable(self, html_str):
                self.html = html_str
                self.app.quit()



page = Page(webliste[0+1])   

filename = "WandertourenLinks.csv"
f = codecs.open(filename, "w","utf-8")


headers ="Tour Name" + ";" + "Länge" + ";" + "Zeit" + ";"  + "Aufstieg" + ";" + "Abstieg" + ";" + "Link zur Tour"+ ";"  + "Anbieter\n"

f.write(headers)



def main():

    soup = bs.BeautifulSoup(page.html, 'html.parser')

    containers = soup.findAll("div", {"class":"oax_dp_snippet"})

    print ("Anzahl der gefundenen touren", len(containers))
#loop

    for container in containers:

        tour_container = container.findAll("span",{"dir":"auto"})
        cont = tour_container[0].text
        print("Name der Tour: ", cont)
        tour_name = cont

        tour_data = container.findAll("div",{"class":"oax_tour_data oax_fl"})
        leange = tour_data[0].text.strip()
        zeit = tour_data[1].text.strip()
        aufstieg = tour_data[2].text.strip()
        abstieg = tour_data[3].text.strip()

        print("Länge der Tour: ", leange)
        print("Länge der Tour: ", zeit)
        print("Länge der Tour: ", aufstieg)
        print("Länge der Tour: ", abstieg)

        link = container.a["href"]
        link_a = link
        print ("Link zur Tour: ", link)

        tour_anbieter = container.findAll("div",{"class":"oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"})
        anbieter = tour_anbieter[0].text.strip()
        print("Tourenanbieter: ", anbieter)
        f.write(tour_name + ";" + leange + ";" + zeit + ";"  + aufstieg + ";" + abstieg + ";" + link+ ";"  + anbieter+ "\n")

    f.close()



if __name__ == '__main__': main()

@Steve Haigh, thank you, your second suggestion was the best one. Now everything works. The key change is that a Page object is now created inside the loop, once per URL, instead of only once after it. I know it is not a very elegant program, but it works ;)

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
import codecs


# Renders a JavaScript-driven page in an off-screen QtWebEngine view and keeps the resulting HTML
class Page(QWebEnginePage):

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()  # block until Callable() quits the event loop

    def _on_load_finished(self):
        # toHtml() is asynchronous; the rendered HTML is delivered to Callable()
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()

webliste = []
webliste.append('https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.32663,49.07201')
webliste.append("https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.30002,49.0945")
webliste.append("https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.22097,49.11664")




filename = "WandertourenLinks.csv"
f = codecs.open(filename, "w","utf-8")


headers ="Tour Name" + ";" + "Länge" + ";" + "Zeit" + ";"  + "Aufstieg" + ";" + "Abstieg" + ";" + "Link zur Tour"+ ";"  + "Anbieter\n"

f.write(headers)

def main():

    for i in range(3):

        page = Page(webliste[i])
        soup = bs.BeautifulSoup(page.html, 'html.parser')
        containers = soup.findAll("div", {"class":"oax_dp_snippet"})
        print ("Anzahl der gefundenen touren", len(containers))

        for container in containers:
            tour_container = container.findAll("span",{"dir":"auto"})
            cont = tour_container[0].text
            print("Name der Tour: ", cont)
            tour_name = cont


            tour_data = container.findAll("div",{"class":"oax_tour_data oax_fl"})
            leange = tour_data[0].text.strip()
            zeit = tour_data[1].text.strip()
            aufstieg = tour_data[2].text.strip()
            abstieg = tour_data[3].text.strip()

            print("Länge der Tour: ", leange)       
            print("Länge der Tour: ", zeit)
            print("Länge der Tour: ", aufstieg)
            print("Länge der Tour: ", abstieg)

            link = container.a["href"]
            link_a = link
            print ("Link zur Tour: ", link)

            tour_anbieter = container.findAll("div",{"class":"oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"})
            anbieter = tour_anbieter[0].text.strip()
            print("Tourenanbieter: ", anbieter)



            f.write(tour_name + ";" + leange + ";" + zeit + ";"  + aufstieg + ";" + abstieg + ";" + link+ ";"  + anbieter+ "\n")











    f.close()



if __name__ == '__main__': main()
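
As a side note, the loop does not have to be hard-coded to range(3): iterating over webliste directly keeps it in sync with however many URLs are in the list, and Python's csv module can take care of the delimiters and quoting. The following is only a rough sketch along the lines of the code above (the helper name scrape_all is made up for illustration), reusing the same Page class and the same CSS classes:

import bs4 as bs
import csv

# Sketch: loop over the URL list itself and write the CSV with csv.writer.
# Assumes the Page class and the webliste list defined above.
def scrape_all(urls, out_path="WandertourenLinks.csv"):
    with open(out_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerow(["Tour Name", "Länge", "Zeit", "Aufstieg", "Abstieg", "Link zur Tour", "Anbieter"])

        for url in urls:  # one rendered page per URL
            page = Page(url)
            soup = bs.BeautifulSoup(page.html, "html.parser")
            containers = soup.find_all("div", {"class": "oax_dp_snippet"})
            print("Anzahl der gefundenen Touren", len(containers))

            for container in containers:
                name = container.find("span", {"dir": "auto"}).text
                data = [d.text.strip() for d in container.find_all("div", {"class": "oax_tour_data oax_fl"})]
                link = container.a["href"]
                anbieter = container.find("div", {"class": "oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"}).text.strip()
                writer.writerow([name] + data[:4] + [link, anbieter])

scrape_all(webliste)

Using csv.writer also avoids broken rows if a tour name ever contains a semicolon, since fields are quoted automatically.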
