
Beautiful Soup find_All doesn't find all blocks

I am trying to parse the headhunter.kz site, using Python 3.9 and beautifulsoup4. When I parse a page with vacancies, I get only 20 div blocks with the "serp-item" class, while there are actually 40 (when I open the saved HTML file in a browser, I can see all 40 blocks).

import requests
import os
import time
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd
df = pd.DataFrame({})
global_url = "https://almaty.hh.kz/"
headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
    }
def get_all_pages():
    with open("data/page_1.html") as file:
        src = file.read()

    soup = BeautifulSoup(src,"lxml")
    #find("span", {"class":"pager-item-not-in-short-range"}).
    pages_count = int(soup.find("div",{"class":"pager"}).find_all("a")[-2].text)
    for i in range(1,pages_count+1):
        url = f"https://almaty.hh.kz/search/vacancy?area=160&clusters=true&enable_snippets=true&ored_clusters=true&professional_role=84&professional_role=116&professional_role=36&professional_role=157&professional_role=125&professional_role=156&professional_role=160&professional_role=10&professional_role=150&professional_role=25&professional_role=165&professional_role=73&professional_role=96&professional_role=164&professional_role=104&professional_role=112&professional_role=113&professional_role=148&professional_role=114&professional_role=121&professional_role=124&professional_role=20&search_period=30&hhtmFrom=vacancy_search_list&page={i}"
        r = requests.get(url = url,headers = headers)
        with open(f"data/page_{i}.html","w") as file:
            file.write(r.text)

        time.sleep(3)

    return pages_count+1


def collect_data(pages_count):
    for page in range(1, pages_count+1):
        with open(f"data/page_{page}.html") as file:
            src = file.read()

            soup = BeautifulSoup(src,"lxml")
            items_divs = soup.find_all("div",{"class":"serp-item"})
            print(len(items_divs))
            urls =[]
            for item in items_divs:
                item_url = item.find("span",{"data-page-analytics-event":"vacancy_search_suitable_item"}).find("a",{"class":"serp-item__title"}).get("href")
                urls.append(item_url)
            with open("items_urls.txt","w") as file:
                for url in urls:
                    file.write(f"{url}\n")
            get_data(file_path="items_urls.txt")

def get_data(file_path):
    result_list = []
    with open(file_path) as file:
        urls_list = file.readlines()
        clear_urls_list =[]
        for url in urls_list:
            url = url.strip()
            clear_urls_list.append(url)
    
    i=0
    for url in clear_urls_list:
        i+=1
        response = requests.get(url=url,headers=headers)
        soup = BeautifulSoup(response.text,"lxml")


        try:
            item_name = soup.find("div",{"class":"main-content"}).find("h1",{"data-qa":"vacancy-title"}).text.strip()
        except:
            item_name = 'E1'

        try:
            item_salary = soup.find("div",{"class":"main-content"}).find("div",{"data-qa":"vacancy-salary"}).text.strip()
        except:
            item_salary = 'E2'

        try:
            item_exp = soup.find("div",{"class":"main-content"}).find("span",{"data-qa":"vacancy-experience"}).text.strip()
        except:
            item_exp = 'E3'

        try:
            company_name = soup.find("div",{"class":"main-content"}).find("span",{"class":"vacancy-company-name"}).find("span").text.strip()
        except:
            company_name = 'E4'

        try:
            if soup.find("div",{"class":"main-content"}).find("p",{"class":"vacancy-creation-time-redesigned"}):
                date = soup.find("div",{"class":"main-content"}).find("p",{"class":"vacancy-creation-time-redesigned"}).text.strip()
            else:
                date = soup.find("div",{"class":"main-content"}).find("p",{"class":"vacancy-creation-time"}).text.strip()
        except:
            date = 'E5'

        try:
            if soup.find("div",{"class":"main-content"}).find("span",{"data-qa":"vacancy-view-raw-address"}):
                address = soup.find("div",{"class":"main-content"}).find("span",{"data-qa":"vacancy-view-raw-address"}).text
            elif soup.find("div",{"class":"main-content"}).find("div",{"class":"vacancy-company-bottom"}).find("p", {"data-qa":"vacancy-view-location"}):
                address = soup.find("div",{"class":"main-content"}).find("div",{"class":"vacancy-company-bottom"}).find("p", {"data-qa":"vacancy-view-location"}).text
            elif soup.find("div",{"class":"main-content"}).find("div",{"class":"block-employer--jHuyqacEkkrEkSl3Yg3M"}):
                address = soup.find("div",{"class":"main-content"}).find("div",{"class":"block-employer--jHuyqacEkkrEkSl3Yg3M"}).find("p", {"data-qa":"vacancy-view-location"}).text
        except:
            address = 'Алматы'

        try:
            zanyatost = soup.find("div",{"class":"main-content"}).find("p",{"data-qa":"vacancy-view-employment-mode"}).find("span").text.strip()
        except:
            zanyatost = 'E7'

        try:
            zanyatost2 = soup.find("div",{"class":"main-content"}).find("p",{"data-qa":"vacancy-view-employment-mode"}).text.lstrip(', ')
        except:
            zanyatost2 = 'E8'
        print(i)

        with open('test.csv','a',encoding ="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(
                (
                    item_name,
                    item_salary,
                    item_exp,
                    company_name,
                    date,
                    address,
                    zanyatost,
                    zanyatost2
                )
            )


def main():
    with open('test.csv','w',encoding ="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(
                (
                    'Должность',
                    "Зарплата",
                    "Опыт",
                    "Компания",
                    "Дата обьявления",
                    "Район",
                    "Тип занятости",
                    "Тип занятости2"
                )
            )
    pages_count = get_all_pages()
    collect_data(pages_count=pages_count)

if __name__ == '__main__':
    main()

I tried html5lib, html.parser and lxml, but the result is the same. I also tried soup.select to count the div blocks with the "serp-item" class, and it gives the same number. I think the information for the remaining blocks is stored in JS; if I am right, can someone explain how to parse those remaining blocks?
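One quick way to test that hypothesis is to look for large JSON payloads embedded in script or template tags of the saved HTML, since client-rendered cards are usually built from such embedded state or fetched by a separate XHR. A generic sketch, not specific to headhunter.kz (the "vacancy" keyword and the 10 000-character threshold are just heuristics):

from bs4 import BeautifulSoup

with open("data/page_1.html") as file:
    soup = BeautifulSoup(file.read(), "lxml")

# The server-rendered cards: this prints 20 in the scenario above
print(len(soup.find_all("div", {"class": "serp-item"})))

# Heuristic: any sufficiently large script/template payload that mentions
# "vacancy" is a candidate container for the client-rendered cards
for tag in soup.find_all(["script", "template"]):
    payload = tag.string or ""
    if len(payload) > 10_000 and "vacancy" in payload:
        print(tag.name, tag.attrs.get("id"), len(payload))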

I think you should use Selenium and try scrolling to the end of the page before parsing any data. For example (assuming Chrome; SCROLL_PAUSE_TIME is however long the page needs to render new content after each scroll):


import time

from selenium import webdriver

SCROLL_PAUSE_TIME = 3  # seconds to wait for lazy-loaded content

driver = webdriver.Chrome()  # any WebDriver works; Chrome is assumed here
driver.get(url)  # url: the search-results page from the question

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
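
After the loop exits, the fully rendered DOM can be handed back to BeautifulSoup and parsed exactly as in the question. A minimal sketch of that handoff:

from bs4 import BeautifulSoup

# driver still holds the fully scrolled page from the loop above
soup = BeautifulSoup(driver.page_source, "lxml")
items_divs = soup.find_all("div", {"class": "serp-item"})
print(len(items_divs))  # should now count the lazily loaded cards too
driver.quit()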
