
How to scrape an iframe using Selenium?

I want to extract all the comments from a website. The site uses an iframe for the comment section. I already tried to scrape it with Selenium, but unfortunately I can only scrape one comment. How do I scrape the rest of the comments and save them to a CSV or XLS file?

  • Code:
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    driver = webdriver.Chrome()
    driver.get("https://finance.detik.com/berita-ekonomi-bisnis/d-5307853/ri-disebut-punya-risiko-korupsi-yang-tinggi?_ga=2.13736693.357978333.1608782559-293324864.1608782559")
    
    # The comment widget lives inside an iframe, so switch into it first
    iframe = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//iframe[@class='xcomponent-component-frame xcomponent-visible']")))
    driver.switch_to.frame(iframe)
    
    # These XPaths are anchored to one specific comment id, so they only ever match one comment
    xpath = '//*[@id="cmt66363941"]/div[1]/div[1]'
    extract_name = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, xpath)))
    username = extract_name.text
    
    xpath = '//*[@id="cmt66363941"]/div[1]/div[2]'
    extract_comment = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, xpath)))
    comment = extract_comment.text
    
    print(username, comment)
  • Output:
    King Akbarmachinery
    3 hari yang lalu selama korupsi tidak dihukum mati disanalah korupsi masih liar dan ada kalaupun dibuat hukum mati setidaknya bisa mengurangi angka korupsi itu
    Laporkan
    2BalasBagikan:

By the way, how do I remove these lines from the output?

Laporkan
2BalasBagikan:

You should generalize your XPaths in order to grab all the users and all the comments at the same time. You can grab them all at once using presence_of_all_elements_located:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get(
    "https://finance.detik.com/berita-ekonomi-bisnis/d-5307853/ri-disebut-punya-risiko-korupsi-yang-tinggi?_ga=2.13736693.357978333.1608782559-293324864.1608782559")

# Switch into the iframe that hosts the comment widget
iframe = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, "//iframe[@class='xcomponent-component-frame xcomponent-visible']")))
driver.switch_to.frame(iframe)

# Match every username element instead of a single comment id
xpath_users = "//div[contains(@class, 'comment__cmt_dk_name___EGuzI ')]"
extract_names = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, xpath_users)))

# Match every comment body the same way
xpath_comments = "//div[contains(@class, 'comment__cmt_box_text')]"
extract_comments = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, xpath_comments)))

for user, comment in zip(extract_names, extract_comments):
    # Keep only the first line of each element's text; this also drops the
    # trailing "Laporkan" / "Balas" / "Bagikan" UI labels you asked about
    user = user.text.split("\n")[0]
    comment = comment.text.split("\n")[0]
    print(user, comment)
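
To archive the results as the question asks, you can write the scraped pairs to a CSV file with Python's built-in csv module. A minimal sketch, continuing from the extract_names and extract_comments lists above (the comments.csv filename and the column headers are my own choices):

import csv

# Collect (username, comment) rows, keeping only the first text line as above
rows = [(u.text.split("\n")[0], c.text.split("\n")[0])
        for u, c in zip(extract_names, extract_comments)]

# utf-8 keeps the Indonesian text intact; newline="" avoids blank rows on Windows
with open("comments.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["username", "comment"])  # header row
    writer.writerows(rows)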

This is how you can achieve the same thing using the requests module, issuing POST requests with the appropriate parameters, which should fetch the content across all the pages.

import requests
from urllib.parse import unquote

# GraphQL endpoint that feeds the comment widget
url = 'https://apicomment.detik.com/graphql'
payload = {"query":"query search($type: String!, $size: Int!,$anchor: Int!, $sort: String!, $adsLabelKanal: String, $adsEnv: String, $query: [ElasticSearchAggregation]) {\nsearch(type: $type, size: $size,page: $anchor, sort: $sort,adsLabelKanal: $adsLabelKanal, adsEnv: $adsEnv, query: $query){\npaging sorting counter counterparent profile hits {\nposisi hasAds results {\n id author content like prokontra  status news create_date pilihanredaksi refer liker { id } reporter { id status_report } child { id child parent author content like prokontra status create_date pilihanredaksi refer liker { id } reporter { id status_report } authorRefer } } } }}","variables":{"type":"comment","sort":"newest","size":10,"anchor":1,"query":[{"name":"news.artikel","terms":5307853},{"name":"news.site","terms":"dtk"}],"adsLabelKanal":"detik_finance","adsEnv":"desktop"}}

while True:
    r = requests.post(url, json=payload)
    container = r.json()['data']['search']['hits']['results']
    # An empty result list means there are no more pages
    if not container:
        break
    for item in container:
        # Skip entries that carry no author information
        if not item['author']:
            continue
        print(item['author']['name'], unquote(item['content']))

    # Advance to the next page of comments
    payload['variables']['anchor'] += 1
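
The question also mentions saving to an Excel file. A minimal sketch using the third-party openpyxl package (my choice; any xlsx writer would do), reusing the url and payload defined above with anchor still at its starting value:

import requests
from urllib.parse import unquote
from openpyxl import Workbook

# url and payload as defined in the snippet above
wb = Workbook()
ws = wb.active
ws.append(["author", "comment"])  # header row

# Same pagination loop as above, appending rows instead of printing
while True:
    r = requests.post(url, json=payload)
    container = r.json()['data']['search']['hits']['results']
    if not container:
        break
    for item in container:
        if not item['author']:
            continue
        ws.append([item['author']['name'], unquote(item['content'])])
    payload['variables']['anchor'] += 1

wb.save("comments.xlsx")

Note that the terms value 5307853 in the payload matches the article id in the page URL (the d-5307853 part), so the same loop should work for other detik.com articles if you swap in their id.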
