繁体   English   中英

我刚接触网页抓取,这是我的第一个项目,请多包涵。我正在尝试使用 Selenium 抓取一个网站

[英]I am very new to scraping, so please bear with me; this is my first project. I am trying to scrape a site using Selenium

"problem lines"
                                        # Re-query every radio element on the page, then search *inside* the
                                        # i-th one for a span with class "phx-radio__label".
                                        for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                        radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
                                        # NOTE(review): this prints the element object returned by the lookup,
                                        # not its text.
                                        print(radio_label_list)
                                        time.sleep(1)

我正在抓取的网站https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb

(标签图像)我无法根据当前选中的按钮打印出对应的单选按钮标签。我不知道错误是什么,也不知道自己哪里做错了。有人能帮忙解决这个问题吗?这对我的学习会很有帮助。相关的“更改资费”(change tariff)链接见所给出的链接页面,代码如下:

import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

class telekommobiles:
    """Selenium scraper that walks the telekom.de smartphone listing.

    ``telekom()`` opens the listing page, collects the product links, and for
    each product clicks through colour options, radio options and the tariff
    popup, printing section headings and radio-button labels.
    """

    def __init__(self):
        """Store the listing URL and a set of fixed metadata values."""
        # Only self.url is read by telekom(); the remaining attributes are not
        # used anywhere in the code shown here.
        self.url="https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
        self.country='DE'
        # NOTE(review): 'GBP' next to country 'DE' looks inconsistent - confirm.
        self.currency='GBP'
        self.VAT='Included'
        self.shipping = 'free shipping within 3-4 weeks'
        self.Pre_PromotionPrice ='N/A'
        self.color ='N/A'
    def telekom(self):
            """Drive Chrome through the listing and print tariff information.

            Takes no arguments and returns nothing; all output goes to stdout
            via ``print``. The browser is never closed in the code shown here.
            """
        #try:
            driver=webdriver.Chrome()
            driver.maximize_window()
            driver.get(self.url)
            # `today` is assigned but never read below.
            today = date.today()
            # Fixed pause before looking for the cookie banner button.
            time.sleep(5)
            # Accept the cookie banner; the name `cookies` is never read again.
            cookies = driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
            print("cookies accepted")
            # Of the lists below only product_links is filled and used; the
            # others stay empty in the code shown here.
            links_prod_check = []
            prod_models = []
            prod_manufacturer =[]
            prod_memorys = []
            product_colors =[]
            product_price_monthly_payments = []
            product_price_one_time_payments =[]
            product_links = []
            # One container per product tile on the listing page.
            containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
            # `i` is incremented per container but its value is never used
            # before the loops below rebind it.
            i = 1
            for container in containers:
                # Collect the href of the first <a> inside each tile.
                p_links =container.find_element_by_tag_name('a').get_attribute('href')
                i = i + 1
                product_links.append(p_links)
                #print(p_links)
            # Visit every collected product page in turn.
            for links in product_links:
                driver.get(links)
                #time.sleep(5)

                #print(driver.current_url)
                #links_prod_check.append(driver.current_url)

                # Wait (up to 30 s) until the colour variant items are present.
                coloroptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//li[@data-qa='list_ColorVariant']")))
                #print(coloroptions)
                # NOTE(review): this loop and the two nested index loops below
                # all use the name `i`.
                for i in range(len(coloroptions)):
                    # The colour items are looked up again on every pass
                    # instead of reusing `coloroptions`.
                    coloroption = driver.find_elements_by_xpath("//li[@data-qa='list_ColorVariant']")
                    coloroption[i].click()
                    #print(coloroption[i])
                    time.sleep(3)

                    # Wait (up to 30 s) for the radio elements; the variable
                    # names suggest these are memory-size options - presumably.
                    memoryoptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//span[@class='phx-radio__element']")))
                    for i in range(len(memoryoptions)):
                        memoryoption = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                        # NOTE(review): bare except silently ignores any
                        # failure of the click, including an out-of-range index.
                        try:
                            memoryoption[i].click()
                        except:
                            pass

                        time.sleep(5)
                        # Click the button that opens the tariff popup (the
                        # popup is closed again at the end of this loop body).
                        change_traiff = driver.find_element_by_css_selector('button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
                        time.sleep(3)
                        #looping for each section
                        section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
                        #print(len(section_loops))
                        for section_loop in section_loops:
                            #print(section_loop)
                            time.sleep(5)
                            #Headings
                            heading_1 = section_loop.find_element_by_css_selector('h2[class="page-title page-title--lowercase"]').text
                            print(heading_1)
                            # looping for each separate boxes
                            each_box_subcontainers = section_loop.find_elements_by_css_selector('.phx-tariff-box__section')
                            #print(len(each_box_subcontainers))
                            for subcontainer in each_box_subcontainers:
                                #print(subcontainer)
                                # NOTE(review): the radios are looked up on the
                                # whole page via `driver`; `subcontainer` itself
                                # is not used inside this loop body.
                                looping_for_tariff = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH,"//span[@class='phx-radio__element']")))
                                #print(looping_for_tariff)
                                for i in range(len(looping_for_tariff)):
                                    #print(i)
                                    # Best-effort click on the i-th radio; any
                                    # error is swallowed by the bare except.
                                    try:
                                        for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                        for_tariff_loop[i].click()
                                        time.sleep(3)
                                    except:
                                        pass

                                    # NOTE(review): the label span is searched
                                    # for *inside* the radio element span, and
                                    # the element object itself (not its .text)
                                    # is what gets printed.
                                    for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                    radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
                                    print(radio_label_list)
                                    time.sleep(1)




                        # Close the tariff popup before moving on to the next
                        # radio option.
                        change_traiff_close_button = driver.find_element_by_css_selector('span[class="icon-after-yellow-close right close popup-close-tr js-popup-close"]').click()

        
# Script entry: build the scraper and start it right away. There is no
# `if __name__ == "__main__":` guard, so this also runs when the file is imported.
telekom_de=telekommobiles()
telekom_de.telekom()

您是在一个元素内部查找另一个元素。通过 for_tariff_loop[i] 查找 radio_label_list 时,radio_label_list 的 XPath 实际上会变成下面这样:

//span[@class='phx-radio__element']//span[@class="phx-radio__label"]

而 DOM 中并不存在这样的节点。

我试运行了代码的最后一部分,能够按如下方式打印出内存大小。请您尝试并确认:

将 radio_label_list 的 CSS 选择器替换为这个 XPath:./following-sibling::span

from selenium.common.exceptions import WebDriverException

# Click each tariff radio button in turn and print the text of its label.
# The label is not a child of the radio span; it is reached as the <span>
# that follows it as a sibling (./following-sibling::span).
radio_xpath = "//span[@class='phx-radio__element']"

looping_for_tariff = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, radio_xpath))
)
for i in range(len(looping_for_tariff)):
    # The radios are looked up again on every pass rather than reusing the
    # list from the wait above - presumably because a click can re-render
    # them and invalidate earlier references.
    for_tariff_loop = driver.find_elements(By.XPATH, radio_xpath)
    try:
        for_tariff_loop[i].click()
    except WebDriverException:
        # Best effort: a radio that cannot be clicked is skipped, but its
        # label is still printed below. Only Selenium errors are ignored, so
        # Ctrl-C and programming errors are no longer swallowed.
        pass
    else:
        # Give the page time to react to the click.
        time.sleep(3)

    for_tariff_loop = driver.find_elements(By.XPATH, radio_xpath)
    radio_label_list = for_tariff_loop[i].find_element(By.XPATH, "./following-sibling::span").text
    print(radio_label_list)
    time.sleep(1)

根据评论,检查此代码:

from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

# Open one product page, accept the cookie banner, open the tariff list, and
# print the data volume plus price of every tariff box. Boxes that offer
# several radio-button variants get one "<variant>: <price>" line per variant.
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb")
wait = WebDriverWait(driver, 30)
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Accept All']"))).click()

wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[contains(@class,'phx-tariff-notification-box-new__element--desktop-tablet')]/li[2]/button"))).click()

# Locators that are used more than once below.
SECTION_CLASS = "phx-tariff-box__section"
LABELS_XPATH = ".//div[@class='phx-tariff-box__radios-inner']//label"
PRICE_XPATH = ".//p[@data-qa='block_TariffPrice']"

length = len(driver.find_elements(By.CLASS_NAME, SECTION_CLASS))

for i in range(length):
    print("----------------------------------------------------------------------------------------------------------")
    # Look the boxes up again on every pass instead of holding on to
    # references from an earlier pass.
    options = driver.find_elements(By.CLASS_NAME, SECTION_CLASS)
    datas = options[i].find_element(By.XPATH, ".//div[contains(@class,'phx-tariff-box__volume')]").get_attribute("innerText")
    print("data: {}".format(datas))
    types = options[i].find_elements(By.XPATH, LABELS_XPATH)
    if not types:
        # No radio variants in this box: it has a single price.
        price = options[i].find_element(By.XPATH, PRICE_XPATH).get_attribute("innerText")
        print(price)
        continue

    for j in range(len(types)):
        types[j].click()
        time.sleep(2)
        # Re-locate after the click before reading anything from the box.
        options = driver.find_elements(By.CLASS_NAME, SECTION_CLASS)
        types = options[i].find_elements(By.XPATH, LABELS_XPATH)
        try:
            # Raises NoSuchElementException unless this label's input carries
            # the `checked` attribute, i.e. only the selected variant is printed.
            types[j].find_element(By.XPATH, "./input[@checked]")
            tariff_type = types[j].find_element(By.XPATH, "./span[2]").get_attribute("innerText")
            price = options[i].find_element(By.XPATH, PRICE_XPATH).get_attribute("innerText")
        except (NoSuchElementException, StaleElementReferenceException):
            # Best effort: skip variants whose details cannot be read.
            continue
        print(f"{tariff_type}: {price}")

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM