I am trying to scrape information about job applicants, but after `s.post(login_url, data=payload)`
the session gets reset and the program no longer has access to the website content. I have tested just the login step and it works fine, but when I try to access `interviews_url = 'https://www.sparkhire.com/company/interviews'`
it logs me out. Am I doing something wrong?
from bs4 import BeautifulSoup
import requests
import token_scraper
# Endpoints: the login form target and the page to fetch once logged in.
login_url = ('https://www.sparkhire.com/login')
interviews_url = ('https://www.sparkhire.com/company/interviews')

# Form fields sent with the login POST.
# NOTE(review): `token_scraper.token` is obtained outside this session; if the
# site ties the token to the session cookie, it presumably has to be read from
# a GET made with the same session as the POST below -- confirm.
payload = {
    '_token': token_scraper.token,
    'email': 'censored',
    'password': 'censored',
}

# One session is used for both requests so cookies set by the login POST are
# sent along with the follow-up GET.
with requests.session() as s:
    s.post(login_url, data=payload)
    r = s.get(interviews_url)
    # Parse the response body: the session object `s` has no `content`
    # attribute, only the response `r` returned by `s.get` does.
    soup = BeautifulSoup(r.content, 'html.parser')
    print(soup)
from bs4 import BeautifulSoup
import requests
# Browser-like User-Agent string (Firefox 94 on 64-bit Windows 10).
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0'

# Request headers; `main` installs these on its session before any request.
headers = {'User-Agent': _USER_AGENT}
def get_soup(content):
    """Return a BeautifulSoup tree built from *content* with the 'lxml' parser.

    Args:
        content: Markup to parse (``main`` passes the response text).

    Returns:
        The ``BeautifulSoup`` object for the parsed markup.
    """
    return BeautifulSoup(content, features='lxml')
def main(url):
    """Log in through the form at *url* and save the interviews page to disk.

    Within a single session the function:
      1. GETs *url* and reads the ``value`` of the ``input[name=_token]`` field,
      2. POSTs that token together with the e-mail and password back to *url*,
      3. GETs the company interviews page and writes its raw bytes to
         ``view.html`` in the current working directory.

    Args:
        url: Address of the login page; used for both the GET and the POST.

    Raises:
        TypeError: If the page has no ``input[name=_token]`` element
            (``select_one`` returns ``None``, which cannot be subscripted).
    """
    # Static part of the login form; the token is added once it is known.
    credentials = {
        "email": "any@any.com",
        "password": "yourpass"
    }
    with requests.Session() as session:
        session.headers.update(headers)

        # The token is read from a page fetched with this same session.
        login_page = session.get(url)
        token_field = get_soup(login_page.text).select_one('input[name=_token]')
        form = {"_token": token_field['value'], **credentials}
        session.post(url, data=form)

        interviews = session.get('https://www.sparkhire.com/company/interviews')
        with open('view.html', 'wb') as out:
            out.write(interviews.content)
# Run the login-and-download flow against the login page.
main(url='https://www.sparkhire.com/login')
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.