Python - BeautifulSoup - 抓取的內容僅寫入第一個文本文件，不寫入后續文件

Question

我目前正在使用下面的代碼從運動日程網站和 output 將信息抓取到文本文件中。 目前使用我擁有的代碼，數據正確打印到控制台，並且來自第一個 URL （ https://sport-tv-guide.live/live/darts ）的數據按預期輸出到文本文件。

問題是第二個 URL ( https://sport-tv-guide.live/live/boxing/ ) 中的內容沒有輸出到預期的文本文件中（文本文件已創建，但其中沒有內容） .

我正在使用的代碼如下：

import requests
import time
from bs4 import BeautifulSoup

def makesoup(url):
    cookies = {'mycountries' : '101,28,3,102,42,10,18,4,2,22', 'user_time_zone': 'Europe/London',  'user_time_zone_id': '1'} 
    r = requests.post(url,  cookies=cookies)
    return BeautifulSoup(r.text,"lxml")
   
def linkscrape(links, savefile):
    baseurl = "https://sport-tv-guide.live"
    urllist = []
    
    for link in links:
        finalurl = (baseurl+ link['href'])
        urllist.append(finalurl)
        # print(finalurl)
        
    for singleurl in urllist:
        soup2=makesoup(url=singleurl)
        g_data=soup2.find_all('div', {'id': 'channelInfo'})
        c_data=soup2.find_all('div', {'class': 'liveOtherStations clearfix'})
    
    with open(savefile ,"w") as text_file:
        
        for match in g_data:
            try:
                hometeam =  match.find_previous('div', class_='cell40 text-center teamName1').text.strip()
                awayteam =  match.find_previous('div', class_='cell40 text-center teamName2').text.strip()
                print("Competitors; ", hometeam +" "+ "vs" +" "+ awayteam)
            except:
                hometeam = "Home Team element not found"
                awayteam = "Away Team element not found"
            try:
                startime =  match.find('div', class_='time full').text.strip()
                print("Time; ", startime) 
            except:
                startime = "Time element not found"
            try:
                event=  match.find('div', class_='title full').text.strip()
                print("Event:",  event)
            except:
                event = "Event element not found"
            try:
                dateandtime = match.find('div', class_='date full').text.strip()
                print("Date:",  dateandtime)
            except:
                dateandtime = "Date not found"
            try:
                sport = match.find('div', class_='text full').text.strip()
                print("Sport:",  sport)
            except:
                sport = "Sport element not found"
            try:
                singlechannel = match.find('div', class_='station full').text.strip()
                print("Main Channel:",  singlechannel)
                print("-----")
            except:
                singlechannel = "Single Channel element not found"
            for channel in c_data:
                try:
                    channels = match.find('div', class_='stationLive active col-wrap')
                    print("Extra Channels:",  channel.text)
                except:
                    channels = "No channels found"
                    print(channels)
                print("-------")
                
                text_file.writelines("__**Sport:**__" +':' + ' '+ sport +" \n"+"__**Competitors:**__" +':' + ' '+ hometeam + awayteam + event+" \n"+"__**Match Date:**__" +':' + ' ' +dateandtime +" \n"+"__**Match Time:**__"+':' + ' ' +startime +" \n"+ "__**Main Channel**__"+':' + ' '+singlechannel+" \n" + "__**Channels**__"+':' + ' '+channel.text+" \n"+'-' *20 + " \n")


            
def matches():
    
    dict = {"https://sport-tv-guide.live/live/darts/":"/home/brendan/Desktop/testing,txt",  
"https://sport-tv-guide.live/live/boxing/":"/home/brendan/Desktop/boxing.txt"}

    for key,  value  in dict.items():
        soup=makesoup(url = key)
        linkscrape(links= soup.find_all('a', {'class': 'article flag',  'href' : True}) , savefile = value)
        
matches()

下圖顯示了我正在打印到控制台的 output，它顯示正確。

我認為可能存在用於打開文本文件的 while 循環 position 的問題，導致它被創建，但實際.writelines function 在成功創建第一個文本文件后無法正確運行。 我已經嘗試過從 while 循環開始的所有代碼，但這對 output 沒有影響。

不幸的是，我不確定如何從這里開始。

感謝任何可以提供幫助或解決此問題的人。

Answer 1

發現了問題。 在您的代碼中，對於boxing url - https://sport-tv-guide.live/live/boxing/沒有額外的頻道。 因此，控件不會在循環內 go 並且沒有 output 寫入文件。

您可以在列表中收集所有額外的頻道，然后寫入文件

import requests
import time
from bs4 import BeautifulSoup

def makesoup(url):
    cookies = {'mycountries' : '101,28,3,102,42,10,18,4,2,22', 'user_time_zone': 'Europe/London',  'user_time_zone_id': '1'} 
    r = requests.post(url,  cookies=cookies)
    return BeautifulSoup(r.text,"lxml")
   
def linkscrape(links, savefile):
    baseurl = "https://sport-tv-guide.live"
    urllist = []
    print(savefile)
    for link in links:
        finalurl = (baseurl+ link['href'])
        urllist.append(finalurl)
        # print(finalurl)
        
    for singleurl in urllist:
        soup2=makesoup(url=singleurl)
        g_data=soup2.find_all('div', {'id': 'channelInfo'})
        c_data=soup2.find_all('div', {'class': 'liveOtherStations clearfix'})
    
    with open(savefile ,"w") as text_file:
        
        for match in g_data:
            try:
                hometeam =  match.find_previous('div', class_='cell40 text-center teamName1').text.strip()
                awayteam =  match.find_previous('div', class_='cell40 text-center teamName2').text.strip()
                print("Competitors; ", hometeam +" "+ "vs" +" "+ awayteam)
            except:
                hometeam = "Home Team element not found"
                awayteam = "Away Team element not found"
            try:
                startime =  match.find('div', class_='time full').text.strip()
                print("Time; ", startime) 
            except:
                startime = "Time element not found"
            try:
                event=  match.find('div', class_='title full').text.strip()
                print("Event:",  event)
            except:
                event = "Event element not found"
            try:
                dateandtime = match.find('div', class_='date full').text.strip()
                print("Date:",  dateandtime)
            except:
                dateandtime = "Date not found"
            try:
                sport = match.find('div', class_='text full').text.strip()
                print("Sport:",  sport)
            except:
                sport = "Sport element not found"
            try:
                singlechannel = match.find('div', class_='station full').text.strip()
                print("Main Channel:",  singlechannel)
                print("-----")
            except:
                singlechannel = "Single Channel element not found"
            
            extra_channels = []

            for channel in c_data:
                try:
                    channels = match.find('div', class_='stationLive active col-wrap')
                    print("Extra Channels:",  channel.text)
                    extra_channels.append(channel.text)
                except:
                    channels = "No channels found"
                    print(channels)
                    extra_channels.append(channel.text)
                print("-------")
            
            if extra_channels:
                for channel in extra_channels:    
                    text_file.writelines("__**Sport:**__" +':' + ' '+ sport +" \n"+"__**Competitors:**__" +':' + ' '+ hometeam + awayteam + event+" \n"+"__**Match Date:**__" +':' + ' ' +dateandtime +" \n"+"__**Match Time:**__"+':' + ' ' +startime +" \n"+ "__**Main Channel**__"+':' + ' '+singlechannel+" \n" + "__**Channels**__"+':' + ' '+channel+" \n"+'-' *20 + " \n")
            else:
                text_file.writelines("__**Sport:**__" +':' + ' '+ sport +" \n"+"__**Competitors:**__" +':' + ' '+ hometeam + awayteam + event+" \n"+"__**Match Date:**__" +':' + ' ' +dateandtime +" \n"+"__**Match Time:**__"+':' + ' ' +startime +" \n"+ "__**Main Channel**__"+':' + ' '+singlechannel+" \n" + "__**Channels**__"+':' + " \n"+'-' *20 + " \n")
            


            
def matches():
    
    dict = {"https://sport-tv-guide.live/live/darts/":"testing.txt",  
"https://sport-tv-guide.live/live/boxing/":"boxing.txt"}

    for key,  value  in dict.items():
        soup=makesoup(url = key)
        linkscrape(links= soup.find_all('a', {'class': 'article flag',  'href' : True}) , savefile = value)
        
matches()

Python - BeautifulSoup - 抓取的內容僅寫入第一個文本文件，不寫入后續文件

問題描述

1 個解決方案

解決方案1
2 已采納 2020-07-24 06:24:52

Python - BeautifulSoup - 抓取的內容僅寫入第一個文本文件，不寫入后續文件

問題描述

1 個解決方案

解決方案1 2 已采納 2020-07-24 06:24:52

解決方案1
2 已采納 2020-07-24 06:24:52