
Scraping data from a website using Beautiful Soup and Pandas

I have a Python script that uses the BeautifulSoup and Pandas packages to scrape data from a list of URLs, convert the data into a DataFrame, save it as an Excel file, and send it by email as an attachment.

The problem is that when the script runs and finishes scraping the first item, it crashes and returns the error below:

ValueError: 15 columns passed, passed data had 14 columns

I think this means that there is a missing HTML tag, right?

The list includes 3 URLs.

Code:

import time
from datetime import date
import smtplib

import requests
import pandas as pd
from bs4 import BeautifulSoup
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from email.utils import formatdate

def scrape_website():
    url_list = ["https://www.bayt.com/en/international/jobs/executive-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/head-chef-jobs/",
                "https://www.bayt.com/en/international/jobs/executive-sous-chef-jobs/"]
    for url in url_list:
        soup = BeautifulSoup(requests.get(url).content, "lxml")

        links = []
        for a in soup.select("h2.m0.t-regular a"):
            if a['href'] not in links:
                links.append("https://www.bayt.com" + a['href'])
        joineddd = []

        for link in links:
            s = BeautifulSoup(requests.get(link).content, "lxml")
            alldd = [dd.text for dd in s.select(
                "div[class='card-content is-spaced'] dd")]
            alldd.insert(0, link)
            joineddd.append(alldd)
        print("Web Crawling is Done for  {}".format(url))
        convert_to_dataFrame(joineddd)
    send_email()

def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

def convert_to_dataFrame(joineddd):
    df = pd.DataFrame(joineddd, columns=[
        "link", "location", "Company_Industry", "Company_Type",
        "Job_Role", "Employment_Type", "Monthly_Salary_Range",
        "Number_of_Vacancies", "Career_Level",
        "Years_of_Experience", "Residence_Location",
        "Gender","Nationality","Degree","Age"])
    df = remove_unwanted_cols(df, ["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"])
    df_to_excel = df.to_excel(r"F:\\AIenv\web_scrapping\\jobDesc.xlsx", index = False, header=True)
    send_email()

def send_email():
    '''send email '''
    today = date.today()
    file = 'F:\\AIenv\web_scrapping\\jobDesc.xlsx'
    username='XXXXXXXXXXX'
    password='XXXXXXXXXXXXX'
    send_from = 'XXXXXXXXXXXXX'
    send_to = 'XXXXXXXXXXXXXX'
    Cc = 'recipient'
    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = send_to
    msg['Cc'] = Cc
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = 'Hello, This is a test mail {}'.format(today)
    server = smtplib.SMTP('smtp.gmail.com')
    port = '587'
    fp = open(file, 'rb')
    part = MIMEBase('application', 'vnd.ms-excel')
    part.set_payload(fp.read())
    fp.close()
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment', filename='jobs Description--{}'.format(today))
    msg.attach(part)
    smtp = smtplib.SMTP('smtp.gmail.com')
    smtp.ehlo()
    smtp.starttls()
    smtp.login(username, password)
    smtp.sendmail(send_from, send_to.split(',') + msg['Cc'].split(','), msg.as_string())
    smtp.quit()
    print('Mail Sent')


if __name__ == "__main__":
    scrape_website()

Update func scrape_website() to save alldd as a dictionary:

for link in links:
    s = BeautifulSoup(requests.get(link).content, "lxml") 
    ### update Start ###
    alldd = dict()
    alldd['link'] = link
    dd_div = [i for i in s.select("div[class='card-content is-spaced'] div") 
              if ('<dd>' in str(i) ) and ( "<dt>" in str(i))]
    for div in dd_div:
        k = div.select_one('dt').get_text(';', True)
        v = div.select_one('dd').get_text(';', True)
        alldd[k] = v
    ### update End  ###    
    joineddd.append(alldd)


# result
df = pd.DataFrame(joineddd)

alldd sample:


{
         'link': 'https://www.bayt.com/en/qatar/jobs/executive-chef-4298309/',       
         'Job Location': 'Doha, Qatar',
         'Company Industry': 'Real Estate; Hospitality & Accomodation; Catering, Food Service, & Restaurant',
         'Company Type': 'Employer (Private Sector)',
         'Job Role': 'Hospitality and Tourism',
         'Employment Type': 'Unspecified',
         'Monthly Salary Range': 'Unspecified',
         'Number of Vacancies': 'Unspecified',
         'Career Level': 'Mid Career',
         'Years of Experience': 'Min: 7',
         'Residence Location': 'Qatar',
         'Degree': "Bachelor's degree / higher diploma"
}
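
With every row stored as a dictionary keyed by the dt label, pandas builds the columns from the union of the keys and fills NaN wherever a page didn't have a field, so a missing tag no longer breaks the column count. A minimal sketch with made-up rows (the values below are illustrative, not scraped):

import pandas as pd

# Two hypothetical rows: the second page has no "Degree" entry.
joineddd = [
    {"link": "https://www.bayt.com/en/qatar/jobs/executive-chef-4298309/",
     "Job Location": "Doha, Qatar",
     "Degree": "Bachelor's degree / higher diploma"},
    {"link": "https://www.bayt.com/en/qatar/jobs/head-chef-1234567/",
     "Job Location": "Doha, Qatar"},
]

df = pd.DataFrame(joineddd)
print(df.columns.tolist())            # ['link', 'Job Location', 'Degree']
print(df["Degree"].isna().tolist())   # [False, True] -> missing field becomes NaN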

ValueError: 15 columns passed, passed data had 14 columns

What I read here means that you designated the DataFrame to have 15 columns, but the data that you fed it only has 14 features. You need to check your original file to make sure it actually has the data you expect, or adjust your expected columns and their names to match the file.
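
You can reproduce the error without any scraping; the column names below are just placeholders for illustration:

import pandas as pd

columns = ["col_{}".format(i) for i in range(15)]   # 15 expected column names
row = list(range(14))                               # but the row only has 14 values

df = pd.DataFrame([row], columns=columns)
# ValueError: 15 columns passed, passed data had 14 columns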

Let's clean up some of this code.

  1. You don't need to write a function to remove columns; there's already a method to do that with .drop() (a more tolerant variant of the drop is sketched right after this list). So delete the function remove_unwanted_cols(dataset, cols) and simply change the line:

df = remove_unwanted_cols(df, ["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"])

to

df = df.drop(["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"], axis=1)

  2. Do you mean to have it send email twice? You have it do that in both the scrape_website() function and the convert_to_dataFrame() function.

  3. When pulling data to construct a DataFrame, I usually try to avoid lists, for the exact reason you hit this error: one scrape has x columns, but the next one has an extra column (or the lengths simply don't match). Dictionaries are a better way to handle that, with the key being the column name and the value the data, so you'll have a list of dictionaries. Each item in the list is a row, and each dictionary maps the column names to the values for that row. Then you could get rid of the convert_to_dataFrame() function entirely, as pandas can do the conversion for you, but we'll leave it in and you can keep it or remove it as you like.

  4. If you are using r'' for your strings, you don't need to escape the \ characters. Either do r"F:\AIenv\web_scrapping\jobDesc.xlsx", or "F:\\AIenv\\web_scrapping\\jobDesc.xlsx".
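
As a side note on point 1 (this is an optional tweak, not in the original code): because the scraped pages don't always expose every field, one of those columns might be absent from a given run, and .drop() would then raise a KeyError. If you want the drop to tolerate that, pandas lets you pass errors='ignore':

# Drop unwanted columns, but don't fail if a run never produced one of them.
df = df.drop(["Company_Industry", "Company_Type", "Job_Role", "Number_of_Vacancies"],
             axis=1, errors="ignore")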

Code:

import time
from datetime import date
import smtplib

import requests
import pandas as pd
from bs4 import BeautifulSoup
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from email.utils import formatdate

def scrape_website(url):
    soup = BeautifulSoup(requests.get(url).content, "lxml")
    subject = url.split('/')
    subject = [x for x in subject if x != ''][-1]
    links = []
    for a in soup.select("h2.m0.t-regular a"):
        job_url = "https://www.bayt.com" + a['href']
        if job_url not in links:  # compare full URLs so the de-duplication actually works
            links.append(job_url)
    
    joineddd = []
    for link in links:
        row = {}
        s = BeautifulSoup(requests.get(link).content, "lxml")
        job_description = s.find('h2', text='Job Description').find_next('dl')
        data_titles = job_description.find_all('dt')
        for data_title in data_titles:
            dt = '_'.join(data_title.text.split())
            dd = data_title.find_next('dd').text.strip()
            row.update({dt: dd})
                    
        if s.find('h2', text='Preferred Candidate'):
            preferred_candidate = s.find('h2', text='Preferred Candidate').find_next('dl')
            data_titles = preferred_candidate.find_all('dt')
            for data_title in data_titles:
                dt = '_'.join(data_title.text.split())
                dd = data_title.find_next('dd').text.strip()
                row.update({dt: dd})
            
        joineddd.append(row)
            
    print("Web Crawling is Done for  {}".format(url))
    convert_to_dataFrame(joineddd, subject)
    #send_email(subject) #<-- did you want to send here?

def convert_to_dataFrame(joineddd, subject):
    df = pd.DataFrame(joineddd)
    df = df.drop(["Company_Industry","Company_Type","Job_Role","Number_of_Vacancies"], axis=1)
    df_to_excel = df.to_excel(r"F:\AIenv\web_scrapping\jobDesc.xlsx", index=False, header=True)
    send_email(subject)  #<--or do you want to send here??

def send_email(subject):
    '''send email '''
    today = date.today()
    file = r'F:\AIenv\web_scrapping\jobDesc.xlsx'
    username='XXXXXXXXXXX'
    password='XXXXXXXXXXXXX'
    send_from = 'XXXXXXXXXXXXX'
    send_to = 'XXXXXXXXXXXXXX'
    Cc = 'recipient'
    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = send_to
    msg['Cc'] = Cc
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = 'Hello, This is a test mail {} - {}'.format(today,subject)
    fp = open(file, 'rb')
    part = MIMEBase('application', 'vnd.ms-excel')
    part.set_payload(fp.read())
    fp.close()
    encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment', filename='jobs Description--{}'.format(today))
    msg.attach(part)
    smtp = smtplib.SMTP('smtp.gmail.com', 587)  # Gmail's STARTTLS port
    smtp.ehlo()
    smtp.starttls()
    smtp.login(username, password)
    smtp.sendmail(send_from, send_to.split(',') + msg['Cc'].split(','), msg.as_string())
    smtp.quit()
    print('Mail Sent')


url_list = ["https://www.bayt.com/en/international/jobs/executive-chef-jobs/",
            "https://www.bayt.com/en/international/jobs/head-chef-jobs/",
            "https://www.bayt.com/en/international/jobs/executive-sous-chef-jobs/"]

if __name__ == "__main__":
    for url in url_list:
        scrape_website(url)
