
Scraping data from website using Beautiful Soup and Pandas

I have a Python script that uses the BeautifulSoup and Pandas packages to scrape data from a list of URLs, convert the data into a dataframe, save it as an Excel file, and send it by email as an attachment.

The problem is that when the script runs and finishes scraping the first item, it crashes and returns the following error:

ValueError: 15 columns passed, passed data had 14 columns

I think this means an HTML tag is missing, right?

The list includes 3 URLs.

Code:

import time
from datetime import date
import smtplib

import requests
import pandas as pd
from bs4 import BeautifulSoup
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from email.utils import formatdate

def scrape_website():
    url_list = ["https://www.bayt.com/en/international/jobs/executive-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/head-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/executive-sous-chef-jobs/"]
    for url in url_list:
        soup = BeautifulSoup(requests.get(url).content, "lxml")

        links = []
        for a in soup.select("h2.m0.t-regular a"):
            if a['href'] not in links:
                links.append("https://www.bayt.com" + a['href'])
        joineddd = []

        for link in links:
            s = BeautifulSoup(requests.get(link).content, "lxml")
            alldd = [dd.text for dd in s.select(
                "div[class='card-content is-spaced'] dd")]
            alldd.insert(0, link)
            joineddd.append(alldd)
        print("Web Crawling is Done for  {}".format(url))
        convert_to_dataFrame(joineddd)
    send_email()

def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

def convert_to_dataFrame(joineddd):
    df = pd.DataFrame(joineddd, columns=[
        "link", "location", "Company_Industry", "Company_Type",
        "Job_Role", "Employment_Type", "Monthly_Salary_Range",
        "Number_of_Vacancies", "Career_Level",
        "Years_of_Experience", "Residence_Location",
        "Gender","Nationality","Degree","Age"])
    df = remove_unwanted_cols(df, ["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"])
    df_to_excel = df.to_excel(r"F:\\AIenv\web_scrapping\\jobDesc.xlsx", index = False, header=True)
    send_email()

def send_email():
    '''send email '''
    today = date.today()
    file = 'F:\\AIenv\web_scrapping\\jobDesc.xlsx'
    username='XXXXXXXXXXX'
    password='XXXXXXXXXXXXX'
    send_from = 'XXXXXXXXXXXXX'
    send_to = 'XXXXXXXXXXXXXX'
    Cc = 'recipient'
    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = send_to
    msg['Cc'] = Cc
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = 'Hello, This is a test mail {}'.format(today)
    server = smtplib.SMTP('smtp.gmail.com')
    port = '587'
    fp = open(file, 'rb')
    part = MIMEBase('application', 'vnd.ms-excel')
    part.set_payload(fp.read())
    fp.close()
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment', filename='jobs Description--{}'.format(today))
    msg.attach(part)
    smtp = smtplib.SMTP('smtp.gmail.com')
    smtp.ehlo()
    smtp.starttls()
    smtp.login(username, password)
    smtp.sendmail(send_from, send_to.split(',') + msg['Cc'].split(','), msg.as_string())
    smtp.quit()
    print('Mail Sent')


if __name__ == "__main__":
    scrape_website()

Update to func scrape_website(), saving alldd as a dictionary:

for link in links:
    s = BeautifulSoup(requests.get(link).content, "lxml") 
    ### update Start ###
    alldd = dict()
    alldd['link'] = link
    dd_div = [i for i in s.select("div[class='card-content is-spaced'] div") 
              if ('<dd>' in str(i) ) and ( "<dt>" in str(i))]
    for div in dd_div:
        k = div.select_one('dt').get_text(';', True)
        v = div.select_one('dd').get_text(';', True)
        alldd[k] = v
    ### update End  ###    
    joineddd.append(alldd)


# result
df = pd.DataFrame(joineddd)

Sample of alldd:


{
         'link': 'https://www.bayt.com/en/qatar/jobs/executive-chef-4298309/',       
         'Job Location': 'Doha, Qatar',
         'Company Industry': 'Real Estate; Hospitality & Accomodation; Catering, Food Service, & Restaurant',
         'Company Type': 'Employer (Private Sector)',
         'Job Role': 'Hospitality and Tourism',
         'Employment Type': 'Unspecified',
         'Monthly Salary Range': 'Unspecified',
         'Number of Vacancies': 'Unspecified',
         'Career Level': 'Mid Career',
         'Years of Experience': 'Min: 7',
         'Residence Location': 'Qatar',
         'Degree': "Bachelor's degree / higher diploma"
}
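
For reference, a minimal sketch (the second row below is made up) of how pandas lines up these dictionary keys: each key becomes a column, and any field missing from a posting is filled with NaN, so a short row no longer crashes the constructor.

import pandas as pd

# Made-up rows for illustration; the second posting has no "Degree" field.
rows = [
    {"link": "https://www.bayt.com/en/qatar/jobs/executive-chef-4298309/",
     "Career Level": "Mid Career",
     "Degree": "Bachelor's degree / higher diploma"},
    {"link": "https://example.com/another-job",
     "Career Level": "Management"},
]

df = pd.DataFrame(rows)
print(df[["Career Level", "Degree"]])
#   Career Level                              Degree
# 0   Mid Career  Bachelor's degree / higher diploma
# 1   Management                                 NaN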

ValueError: 15 columns passed, passed data had 14 columns

What I read from this is that you specified the dataframe to have 15 columns, but the data you passed in only had 14 features. You need to check the source file to make sure it really contains the data you expect, or adjust the columns you expect and their names to match the file.
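
A minimal sketch of that mismatch, using the column names from your code and a made-up row that is one value short:

import pandas as pd

cols = ["link", "location", "Company_Industry", "Company_Type", "Job_Role",
        "Employment_Type", "Monthly_Salary_Range", "Number_of_Vacancies",
        "Career_Level", "Years_of_Experience", "Residence_Location",
        "Gender", "Nationality", "Degree", "Age"]      # 15 column names

row = ["https://example.com/some-job"] + ["x"] * 13    # a scraped row with only 14 values
pd.DataFrame([row], columns=cols)
# ValueError: 15 columns passed, passed data had 14 columns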

Let's clean up some of this code.

  1. You don't need to write a function to remove columns; there is already a method for that, .drop(). So delete the function remove_unwanted_cols(dataset, cols) and simply change this line (a small defensive variant is sketched after this list):

df = remove_unwanted_cols(df, ["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"])

to:

df = df.drop(["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"], axis=1)

  2. Did you mean for it to send the email twice? You do it in both the scrape_website() function and the convert_to_dataFrame() function.

  3. When extracting data to build a dataframe, I generally try to avoid lists, for exactly the reason you're getting this error: one site has x columns, but the next scrape has an extra one (or a mismatch in len). Dictionaries are a better way to handle this, with the keys being the column names and the values being the data. So you would have a list of dictionaries: each item in the list is a row, and each dictionary holds the values for the columns. You could then get rid of the convert_to_dataFrame() function, since pandas can do that for you, but we'll leave it in and you can keep it or remove it as you like.

  4. You don't need to escape the \ if you use r'' for the string. Either do r"F:\AIenv\web_scrapping\jobDesc.xlsx" or "F:\\AIenv\\web_scrapping\\jobDesc.xlsx".
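
On point 1, one optional safeguard (just a sketch): now that the rows come from scraped dictionaries, a listed column may not exist for every search, and .drop() raises a KeyError for a missing label. Passing errors="ignore" skips any column that isn't there:

df = df.drop(["Company_Industry", "Company_Type", "Job_Role", "Number_of_Vacancies"],
             axis=1, errors="ignore")  # silently skip any of these that are missing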

Code:

import time
from datetime import date
import smtplib

import requests
import pandas as pd
from bs4 import BeautifulSoup
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from email.utils import formatdate

def scrape_website(url):
    soup = BeautifulSoup(requests.get(url).content, "lxml")
    subject = url.split('/')
    subject = [x for x in subject if x != ''][-1]
    links = []
    for a in soup.select("h2.m0.t-regular a"):
        if a['href'] not in links:
            links.append("https://www.bayt.com" + a['href'])
    
    joineddd = []
    for link in links:
        row = {}
        s = BeautifulSoup(requests.get(link).content, "lxml")
        job_description = s.find('h2', text='Job Description').find_next('dl')
        data_titles = job_description.find_all('dt')
        for data_title in data_titles:
            dt = '_'.join(data_title.text.split())
            dd = data_title.find_next('dd').text.strip()
            row.update({dt: dd})
                    
        if s.find('h2', text='Preferred Candidate'):
            preferred_candidate = s.find('h2', text='Preferred Candidate').find_next('dl')
            data_titles = preferred_candidate.find_all('dt')
            for data_title in data_titles:
                dt = '_'.join(data_title.text.split())
                dd = data_title.find_next('dd').text.strip()
                row.update({dt: dd})
            
        joineddd.append(row)
            
    print("Web Crawling is Done for  {}".format(url))
    convert_to_dataFrame(joineddd, subject)
    #send_email(subject) #<-- did you want to send here?

def convert_to_dataFrame(joineddd, subject):
    df = pd.DataFrame(joineddd)
    df = df.drop(["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"], axis=1)
    df_to_excel = df.to_excel(r"F:\AIenv\web_scrapping\jobDesc.xlsx", index = False, header=True)
    send_email(subject)  #<--or do you want to send here??

def send_email(subject):
    '''send email '''
    today = date.today()
    file = 'F:\\AIenv\web_scrapping\\jobDesc.xlsx'
    username='XXXXXXXXXXX'
    password='XXXXXXXXXXXXX'
    send_from = 'XXXXXXXXXXXXX'
    send_to = 'XXXXXXXXXXXXXX'
    Cc = 'recipient'
    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = send_to
    msg['Cc'] = Cc
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = 'Hello, This is a test mail {} - {}'.format(today,subject)
    server = smtplib.SMTP('smtp.gmail.com')
    port = '587'
    fp = open(file, 'rb')
    part = MIMEBase('application', 'vnd.ms-excel')
    part.set_payload(fp.read())
    fp.close()
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment', filename='jobs Description--{}'.format(today))
    msg.attach(part)
    smtp = smtplib.SMTP('smtp.gmail.com')
    smtp.ehlo()
    smtp.starttls()
    smtp.login(username, password)
    smtp.sendmail(send_from, send_to.split(',') + msg['Cc'].split(','), msg.as_string())
    smtp.quit()
    print('Mail Sent')


url_list = ["https://www.bayt.com/en/international/jobs/executive-chef-jobs/",
            "https://www.bayt.com/en/international/jobs/head-chef-jobs/",
            "https://www.bayt.com/en/international/jobs/executive-sous-chef-jobs/"]

if __name__ == "__main__":
    for url in url_list:
        scrape_website(url)
