How to scrape an iframe using Selenium?

问题

I want to extract all the comments from a website. The website uses an iframe for the comment section. I have already tried to scrape it using Selenium, but unfortunately I can only scrape one comment. How can I scrape the rest of the comments and save them to csv or xmls?

Code :

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # Start a Chrome session and open the article page.
    driver = webdriver.Chrome()
    # NOTE(review): `page` is assigned here but never used in the rest of the snippet.
    page = driver.get("https://finance.detik.com/berita-ekonomi-bisnis/d-5307853/ri-disebut-punya-risiko-korupsi-yang-tinggi?_ga=2.13736693.357978333.1608782559-293324864.1608782559")
    
    # Wait (timeout argument: 20) for the comment iframe to be present, then
    # switch into it so the lookups below run inside the iframe.
    iframe = WebDriverWait(driver,20).until(EC.presence_of_element_located((By.XPATH, "//iframe[@class='xcomponent-component-frame xcomponent-visible']")))
    driver.switch_to.frame(iframe)
    
    # This XPath is anchored to one specific element id ("cmt66363941"), and
    # presence_of_element_located returns a single element, so only that one
    # comment's name is read.
    xpath = '//*[@id="cmt66363941"]/div[1]/div[1]'
    extract_name = WebDriverWait(driver,20).until(EC.presence_of_element_located((By.XPATH, xpath)))
    username=extract_name.text
    
    # Same hard-coded id, second child div: the text of that one comment.
    xpath = '//*[@id="cmt66363941"]/div[1]/div[2]'
    extract_comment = WebDriverWait(driver,20).until(EC.presence_of_element_located((By.XPATH, xpath)))
    comment=extract_comment.text
    
    # Prints the full multi-line .text of both elements (see the output below).
    print(username, comment)

Output

    King Akbarmachinery
    3 hari yang lalu selama korupsi tidak dihukum mati disanalah korupsi masih liar dan ada kalaupun dibuat hukum mati setidaknya bisa mengurangi angka korupsi itu
    Laporkan
    2BalasBagikan:

By the way, how can I remove these lines from the output?

Laporkan
2BalasBagikan:

回答1:

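You should generalize your XPaths so that they match every user name and every comment at once, instead of one hard-coded comment id. You can then collect all the users and all the comments with `presence_of_all_elements_located`. Keeping only the first line of each element's text (`.text.split("\n")[0]`) also drops the trailing "Laporkan" / "Balas" lines from the output.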

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Scrape every visible user name / comment pair from the article's comment
# iframe and print them, one pair per line.
driver = webdriver.Chrome()
try:
    # The return value of driver.get() is not needed, so it is not stored.
    driver.get(
        "https://finance.detik.com/berita-ekonomi-bisnis/d-5307853/ri-disebut-punya-risiko-korupsi-yang-tinggi?_ga=2.13736693.357978333.1608782559-293324864.1608782559")

    # One wait object (timeout argument: 20) reused for every lookup below.
    wait = WebDriverWait(driver, 20)

    # The comment section lives inside an iframe; switch into it so the XPath
    # lookups below are evaluated against the iframe's document.
    iframe = wait.until(
        EC.presence_of_element_located((By.XPATH, "//iframe[@class='xcomponent-component-frame xcomponent-visible']")))
    driver.switch_to.frame(iframe)

    # Match on CSS class fragments instead of a single comment id so that all
    # comments are selected, not just one.
    xpath_users = "//div[contains(@class, 'comment__cmt_dk_name___EGuzI ')]"
    extract_names = wait.until(EC.presence_of_all_elements_located((By.XPATH, xpath_users)))

    xpath_comments = "//div[contains(@class, 'comment__cmt_box_text')]"
    extract_comments = wait.until(EC.presence_of_all_elements_located((By.XPATH, xpath_comments)))

    # NOTE(review): pairing assumes both lists are in the same order and of
    # equal length -- zip() silently stops at the shorter one.
    for name_element, comment_element in zip(extract_names, extract_comments):
        # Keep only the first line of each element's text; in the question's
        # output the later lines are the "Laporkan" / "Balas" controls.
        user = name_element.text.split("\n")[0]
        comment = comment_element.text.split("\n")[0]
        print(user, comment)
finally:
    # Always end the browser session, even if a wait above times out.
    driver.quit()

回答2:

You can achieve the same result with the `requests` module alone: send POST requests with the appropriate parameters to the comment API, which returns the content across all pages.

import requests
from urllib.parse import unquote

# Endpoint the comment requests are POSTed to (a GraphQL API, judging by the
# path and the query text below).
url = 'https://apicomment.detik.com/graphql'
# JSON request body: the `search` query text plus its variables.
# - "anchor" is passed as the query's `page` argument and starts at 1.
# - "size" is 10 (presumably the number of results per page -- TODO confirm).
# - "news.artikel" terms 5307853 presumably identifies the article (it matches
#   the "d-5307853" part of the article URL) -- verify before reusing.
payload = {"query":"query search($type: String!, $size: Int!,$anchor: Int!, $sort: String!, $adsLabelKanal: String, $adsEnv: String, $query: [ElasticSearchAggregation]) {\nsearch(type: $type, size: $size,page: $anchor, sort: $sort,adsLabelKanal: $adsLabelKanal, adsEnv: $adsEnv, query: $query){\npaging sorting counter counterparent profile hits {\nposisi hasAds results {\n id author content like prokontra  status news create_date pilihanredaksi refer liker { id } reporter { id status_report } child { id child parent author content like prokontra status create_date pilihanredaksi refer liker { id } reporter { id status_report } authorRefer } } } }}","variables":{"type":"comment","sort":"newest","size":10,"anchor":1,"query":[{"name":"news.artikel","terms":5307853},{"name":"news.site","terms":"dtk"}],"adsLabelKanal":"detik_finance","adsEnv":"desktop"}}

# Fetch the comment pages one at a time, printing "author content" for each
# comment, until the API returns a page with no results.
while True:
    # Without a timeout a stalled connection would block the script forever.
    response = requests.post(url, json=payload, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    response.raise_for_status()

    results = response.json()['data']['search']['hits']['results']
    if not results:
        break

    for item in results:
        author = item['author']
        # Skip entries with no author data; a truthiness test also covers
        # None, where len() would raise TypeError.
        if not author:
            continue
        # The content is passed through unquote(), i.e. it is treated as
        # percent-encoded text.
        print(author['name'], unquote(item['content']))

    # "anchor" is sent as the query's `page` argument, so this moves on to the
    # next page of comments.
    payload['variables']['anchor'] += 1

来源：https://stackoverflow.com/questions/65509731/how-to-scraping-iframe-using-selenium

标签

python

selenium

web-scraping

iframe