I've been trying to obtain contact details - emails from my connections on LinkedIn. Unfortunately, the code does not progress beyond scrolling through my connection list. Any ideas?
import re
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs
import time
import random
# Scrape the e-mail addresses of first-degree LinkedIn connections.
# NOTE(review): automated scraping of LinkedIn very likely violates its ToS.

# --- Selenium setup and LinkedIn login ---
path = "C:\\Program Files (x86)\\chromedriver.exe"
driver = webdriver.Chrome(path)

driver.get("https://www.linkedin.com")
username = driver.find_element(By.ID, 'session_key')
username.send_keys('')  # TODO: fill in your LinkedIn login
password = driver.find_element(By.ID, 'session_password')
password.send_keys('')  # TODO: fill in your password
log_in_button = driver.find_element(By.CLASS_NAME, 'sign-in-form__submit-button')
log_in_button.click()

# --- Scroll the connections page until every lazy-loaded card is present ---
driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
total_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(random.uniform(2.5, 4.9))
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == total_height:
        break
    # BUG FIX: the original assigned to `last_height` here but compared
    # against `total_height` above, so once the page grew the loop never
    # terminated — the script got stuck scrolling forever.
    total_height = new_height

# --- Collect profile URLs from the fully loaded connection list ---
page = bs(driver.page_source, 'html.parser')
content = page.find_all('a', {'class': "mn-connection-card__link ember-view"})
mynetwork = [card.get('href') for card in content]
print(len(mynetwork), " connections")

# --- Visit each profile's contact-info overlay and pull its mailto: links ---
my_network_emails = []
for profile in mynetwork:  # renamed from `contact`, which the inner loop shadowed
    driver.get("https://www.linkedin.com" + profile + "detail/contact-info/")
    driver.implicitly_wait(3)
    contact_page = bs(driver.page_source, 'html.parser')
    # The original repeated this scrape a second time after the loop,
    # duplicating the last profile's e-mails; it is done exactly once here.
    for link in contact_page.find_all('a', href=re.compile("mailto")):
        email = link.get('href')[7:]  # strip the leading "mailto:" scheme
        print("[+]", email)
        my_network_emails.append(email)
    # wait a few seconds before connecting to the next profile
    time.sleep(random.uniform(0.5, 1.9))

# --- Persist results; newline='' prevents blank rows on Windows ---
with open('network_emails.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for email in my_network_emails:
        writer.writerow([email])
First of all, I should probably mention that this might be in violation of LinkedIn's TOS. Second, I would also suggest not doing page = bs(driver.page_source, 'html.parser'), but rather scraping the information directly from the browser using Selenium. Why parse the page twice?
That said, there's probably something happening in that scroll loop. Can you insert print statements to see how new_height and total_height change while scrolling? Does the page load new results when you scroll to the bottom? If yes, you may need to insert a wait there. What I think is happening (without having worked with LinkedIn before) is that document.body.scrollHeight changes when new results are loaded, and at some point new_height becomes greater than total_height — which the print statements I mentioned above would make visible.
If that's indeed the case, there are 2 things I think can help you:
# Keep scrolling until the page height stops growing,
# i.e. no more results are being lazy-loaded.
prev_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(random.uniform(2.5, 4.9))
    curr_height = driver.execute_script("return document.body.scrollHeight")
    if curr_height == prev_height:
        break
    prev_height = curr_height
Your code seems to try to do this already, but you have a typo: at the bottom of the loop you assign the new height to last_height instead of total_height, yet the loop condition still compares against total_height. I think that typo is the actual cause of the bug. However, I would like to suggest another, more general technique:
def find_all_links():
    """Scroll repeatedly, accumulating links, until a scroll yields no new ones."""
    seen = set()
    while True:
        scroll_page_and_wait()
        # e.g. page.find_all('a', {'class': "mn-connection-card__link ember-view"})
        before = len(seen)
        seen.update(gather_page_links())
        if len(seen) == before:
            # nothing new appeared after this scroll — we are done
            break
    return seen
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.