
Python traversal through all <a> links with Selenium

How can I go through all pages? I want to traverse every page of the search results. I understand how to do it when I hard-code a URL, but I don't know how to do it when the URL has to come from the browser itself. The issue right now is that there are two get() calls, and the second one pretty much ruins the entire program. What do I do? Essentially I want to do this without using driver.current_url, because in the full code (with login details) the loop breaks once the driver has navigated away. The top code is the problem in its simplest form; the bottom code is the half-complete version where the looping becomes an issue.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import csv
import re
import requests
from bs4 import BeautifulSoup

driver_service = Service(executable_path=r"C:\Program Files (x86)\chromedriver.exe")  # raw string so the backslashes aren't treated as escapes
driver = webdriver.Chrome(service=driver_service)
driver.maximize_window()
wait = WebDriverWait(driver, 5)

driver.get('https://www.seek.com.au/data-jobs-in-information-communication-technology/in-All-Perth-WA')
url_template = driver.current_url
print(url_template)
template = url_template + '?page={}'

for page in range(1,5):
    driver.get(template.format(page))
    print("Job page number:",page)
    link_job = [x.get_attribute('href') for x in driver.find_elements(By.XPATH, "//a[@data-automation='jobTitle']")]
    for job in link_job:
        driver.get(job)
        try:
            quick_apply = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, "//a[@data-automation='job-detail-apply' and @target='_self']")))
            quick_apply.click()
        except TimeoutException:
            print("No records found " + job)


#---------------------------------------------------------------------------------
driver_service = Service(executable_path=r"C:\Program Files (x86)\chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)
driver.maximize_window()
wait = WebDriverWait(driver, 5)
driver.get('https://www.seek.com.au')  # open the Seek home page and search from there

job_keyword = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="keywords-input"]')))
jk = 'data'
job_keyword.send_keys(jk)
driver.find_element(By.XPATH,
                    '//*[@id="SearchBar"]/div[3]/div[1]/div/div/div[2]/div/div/label[2]').click()  # open the classification panel
sleep(1)
driver.find_element(By.XPATH, '//*[@id="classificationsPanel"]/nav/ul/li[18]/a').click()  # pick the classification
sleep(1)
driver.find_element(By.XPATH,
                    '//*[@id="SearchBar"]/div[3]/div[1]/div/div/div[2]/div/div/label[2]').click()  # close the panel
job_location = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="SearchBar__Where"]')))
location_search = 'Perth'
job_location.send_keys(location_search)
job_keyword.send_keys(Keys.RETURN)
sleep(5)
#looking_job = [x.get_attribute('href') for x in driver.find_elements(By.XPATH, "//a[@data-automation='jobTitle']")]
#url_test = driver.get('https://www.seek.com.au/data-jobs-in-information-communication-technology/in-All-Perth-WA')
url_template = driver.current_url
print(url_template)
template = url_template + '?page={}'

for page in range(1,5):
    driver.get(template.format(page))
    link_job = [x.get_attribute('href') for x in driver.find_elements(By.XPATH, "//a[@data-automation='jobTitle']")]
    for job in link_job:
        driver.get(job)
        #print(job)
        try:
            quick_apply = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, "//a[@data-automation='job-detail-apply' and @target='_self']")))
            quick_apply.click()
        except TimeoutException:
            print("No records found " + job)
        sleep(3)
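
For reference, one way to sidestep driver.current_url changing mid-loop is to capture it a single time, right after the search results load, and strip any query string before building the template. A minimal sketch using only the standard library (urllib.parse), under the assumption that the search has already been submitted:

from urllib.parse import urlsplit, urlunsplit

# Capture the results URL exactly once, immediately after the search submits
# and before navigating to any individual job page.
parts = urlsplit(driver.current_url)
base_url = urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))
template = base_url + '?page={}'

for page in range(1, 5):
    driver.get(template.format(page))
    # collect the job links here; driver.current_url is never read again,
    # so visiting individual job pages cannot corrupt the template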

The website is backed by an API, so you can pull all the required data, across all pages, as JSON via plain GET requests using the requests module only. Why not take the easiest and most robust way? (The endpoint and query parameters below come from the search page's network traffic.)

Example:

import pandas as pd
import requests

params = {
    'siteKey': 'AU-Main',
    'sourcesystem': 'houston',
    'userqueryid': '7cb7b6cba52d0fc86c4955905aa0d15f-1359713',
    'userid': '0b604fc99bf7444eeda28ffaf2e10f06',
    'usersessionid': '0b604fc99bf7444eeda28ffaf2e10f06',
    'eventCaptureSessionId': '0b604fc99bf7444eeda28ffaf2e10f06',
    'where': 'All Perth WA',
    'page': '2',
    'seekSelectAllPages': 'true',
    'keywords': 'data',
    'classification': '6281',
    'hadPremiumListings': 'true',
    'include': 'seodata',
    'solId': '37215a53-00ec-496c-90b5-00228abf7cd7'
}
api_url = 'https://www.seek.com.au/api/chalice-search/search'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}

data = []
for page in range(1, 22):
    params['page'] = page  # advance the page number on each request
    res = requests.get(api_url, params=params, headers=headers)  # pass the query parameters explicitly
    for item in res.json()['data']:
        data.append({'title': item['title']})

df = pd.DataFrame(data)
print(df)

Output:

                                  title
0                              Front Desk Receptionist
1    Customer Service & Sales Superstar (hybrid/wor...
2                                 Academy Receptionist
3    Customer Service Administrator - Work from Hom...
4                                     Kennel Attendant
..                                                 ...
415                            Veterinary Receptionist
416                            Veterinary Receptionist
417  Bar, Catering & baristas staff required - No E...
418                             headspace Receptionist
419         Markerting / Office Administration Support

[420 rows x 1 columns]
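
If you want to keep the results rather than just print them, the DataFrame can be written out with pandas (the filename here is only an example):

df.to_csv('seek_jobs.csv', index=False)  # persist the scraped titles to a CSV file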


 
