How can I loop through all the pages? I want to do page traversal through every page of results. I understand how to do it when I hard-code a URL by itself, but I don't know how to do it without grabbing one. The issue right now is that there are two get() calls, and the second one pretty much ruins the entire program. What should I do? Essentially I want to do this without using driver.current_url, since in the full code the loop breaks because of it. The top code is the simplest form, the bottom code is half complete; in the fully complete version (with login details) the looping becomes an issue.
# Standard library
import csv
import re
from time import sleep

# Third-party
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# --- Simple form: paginate a hard-coded SEEK search URL -----------------------
# Raw string avoids the invalid "\P" escape sequence in the Windows path.
driver_service = Service(executable_path=r"C:\Program Files (x86)\chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)
driver.maximize_window()  # load web driver
wait = WebDriverWait(driver, 5)

# driver.get() returns None, so assigning it (the old `url_test = ...`) is
# useless.  Keep the search URL in a plain variable instead of re-reading
# driver.current_url after navigating -- that was the source of the bug.
base_url = ('https://www.seek.com.au/data-jobs-in-information-communication-'
            'technology/in-All-Perth-WA')
driver.get(base_url)
print(base_url)
template = base_url + '?page={}'

for page in range(1, 5):
    driver.get(template.format(page))
    print("Job page number:", page)
    # Grab all hrefs up front: the WebElements themselves go stale as soon
    # as we navigate to an individual job page.
    link_job = [x.get_attribute('href')
                for x in driver.find_elements(By.XPATH,
                                              "//a[@data-automation='jobTitle']")]
    for job in link_job:
        driver.get(job)
        try:
            quick_apply = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable(
                    (By.XPATH,
                     "(//a[@data-automation='job-detail-apply' and @target='_self'])")))
            quick_apply.click()
        except TimeoutException:
            # No quick-apply button within 3 s; log and move to the next job.
            # (A bare `except:` here used to hide every other failure too,
            # including KeyboardInterrupt.)
            print("No records found " + job)
#---------------------------------------------------------------------------------
# --- Half-complete form: search via the UI, then paginate the result URL ------
# Raw string avoids the invalid "\P" escape sequence in the Windows path.
driver_service = Service(executable_path=r"C:\Program Files (x86)\chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)
driver.maximize_window()  # load web driver
wait = WebDriverWait(driver, 5)

driver.get('https://www.seek.com.au')  # get login page for seek

# Type the keyword into the search box.
job_keyword = wait.until(
    EC.visibility_of_element_located((By.XPATH, '//*[@id="keywords-input"]')))
jk = 'data'
job_keyword.send_keys(jk)

# Open the classification drop-down, pick the ICT classification, then close
# the panel again.  `.click()` returns None, so the old assignments to
# `classification_type` / `classification` were dead locals and are dropped.
driver.find_element(
    By.XPATH,
    '//*[@id="SearchBar"]/div[3]/div[1]/div/div/div[2]/div/div/label[2]').click()
sleep(1)
driver.find_element(
    By.XPATH, '//*[@id="classificationsPanel"]/nav/ul/li[18]/a').click()
sleep(1)
driver.find_element(
    By.XPATH,
    '//*[@id="SearchBar"]/div[3]/div[1]/div/div/div[2]/div/div/label[2]').click()

# Fill in the location and submit the search.
job_location = wait.until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="SearchBar__Where"]')))
location_search = 'Perth'
job_location.send_keys(location_search)
job_keyword.send_keys(Keys.RETURN)
sleep(5)

# Read driver.current_url exactly ONCE, before any further navigation, and
# build the template from that snapshot.  (The original read it again through
# a redundant `get_url = ""; get_url += driver.current_url` pair; the real
# looping problem is re-reading current_url after driver.get() has moved on.)
url_template = driver.current_url
print(url_template)
template = url_template + '?page={}'

for page in range(1, 5):
    driver.get(template.format(page))
    # Collect hrefs before navigating away, so we never touch stale elements.
    link_job = [x.get_attribute('href')
                for x in driver.find_elements(By.XPATH,
                                              "//a[@data-automation='jobTitle']")]
    for job in link_job:
        driver.get(job)
        try:
            quick_apply = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable(
                    (By.XPATH,
                     "(//a[@data-automation='job-detail-apply' and @target='_self'])")))
            quick_apply.click()
        except TimeoutException:
            # No quick-apply link on this listing; note it and continue
            # (narrowed from a bare `except:` that hid every error).
            print("No records found " + job)
        sleep(3)
The website exposes an API, and you can pull all the required data — along with every page — from that API with a GET request, receiving the data as JSON, using the requests
module alone. Why not take the easiest and most robust route?
Example:
import pandas as pd
import requests

# Query parameters for SEEK's chalice-search API.  'page' is overwritten on
# every loop iteration below; the session/user IDs were captured from a live
# browser session and may need refreshing.
params = {
    'siteKey': 'AU-Main',
    'sourcesystem': 'houston',
    'userqueryid': '7cb7b6cba52d0fc86c4955905aa0d15f-1359713',
    'userid': '0b604fc99bf7444eeda28ffaf2e10f06',
    'usersessionid': '0b604fc99bf7444eeda28ffaf2e10f06',
    'eventCaptureSessionId': '0b604fc99bf7444eeda28ffaf2e10f06',
    'where': 'All Perth WA',
    'page': '2',
    'seekSelectAllPages': 'true',
    'keywords': 'data',
    'classification': '6281',
    'hadPremiumListings': 'true',
    'include': 'seodata',
    'solId': '37215a53-00ec-496c-90b5-00228abf7cd7',
}
api_url = 'https://www.seek.com.au/api/chalice-search/search'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}

data = []
for page in range(1, 22):
    params['page'] = page
    # BUG FIX: params must actually be sent with the request.  The original
    # code mutated params['page'] in the loop header but called
    # requests.get(api_url, headers=headers) WITHOUT params, so every
    # iteration silently fetched the same default result page.
    res = requests.get(api_url, params=params, headers=headers)
    for item in res.json()['data']:
        data.append({'title': item['title']})

df = pd.DataFrame(data)
print(df)
Output:
title
0 Front Desk Receptionist
1 Customer Service & Sales Superstar (hybrid/wor...
2 Academy Receptionist
3 Customer Service Administrator - Work from Hom...
4 Kennel Attendant
.. ...
415 Veterinary Receptionist
416 Veterinary Receptionist
417 Bar, Catering & baristas staff required - No E...
418 headspace Receptionist
419 Markerting / Office Administration Support
[420 rows x 1 columns]
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.