Question
I want to extract all comments from a website. The site loads the comment section in an iframe. I already tried to scrape it with Selenium, but unfortunately I can only scrape one comment. How can I scrape the rest of the comments and archive them to CSV or XLSX?
- Code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://finance.detik.com/berita-ekonomi-bisnis/d-5307853/ri-disebut-punya-risiko-korupsi-yang-tinggi?_ga=2.13736693.357978333.1608782559-293324864.1608782559")

# wait for the comment iframe, then switch into it
iframe = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//iframe[@class='xcomponent-component-frame xcomponent-visible']")))
driver.switch_to.frame(iframe)

# these XPaths target one specific comment (id cmt66363941), which is why only one is scraped
xpath = '//*[@id="cmt66363941"]/div[1]/div[1]'
extract_name = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, xpath)))
username = extract_name.text

xpath = '//*[@id="cmt66363941"]/div[1]/div[2]'
extract_comment = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, xpath)))
comment = extract_comment.text

print(username, comment)
- Output:
King Akbarmachinery
3 hari yang lalu selama korupsi tidak dihukum mati disanalah korupsi masih liar dan ada kalaupun dibuat hukum mati setidaknya bisa mengurangi angka korupsi itu
Laporkan
2BalasBagikan:
By the way, how do I remove these lines from the output?
Laporkan
2BalasBagikan:
Answer 1:
You should generalize your XPaths so that you grab all the users and all the comments at once. You can collect every matching element with presence_of_all_elements_located. Taking only the first line of each element's text (split("\n")[0]) also drops the trailing "Laporkan" / "BalasBagikan" lines you asked about:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://finance.detik.com/berita-ekonomi-bisnis/d-5307853/ri-disebut-punya-risiko-korupsi-yang-tinggi?_ga=2.13736693.357978333.1608782559-293324864.1608782559")

# wait for the comment iframe, then switch into it
iframe = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, "//iframe[@class='xcomponent-component-frame xcomponent-visible']")))
driver.switch_to.frame(iframe)

# grab every username element and every comment element in one go
xpath_users = "//div[contains(@class, 'comment__cmt_dk_name___EGuzI')]"
extract_names = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, xpath_users)))
xpath_comments = "//div[contains(@class, 'comment__cmt_box_text')]"
extract_comments = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, xpath_comments)))

# keep only the first text line, dropping the "Laporkan" / "Balas" widget labels
for user, comment in zip(extract_names, extract_comments):
    name = user.text.split("\n")[0]
    text = comment.text.split("\n")[0]
    print(name, text)
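To archive the scraped pairs to CSV (the part of the question neither answer shows), here is a minimal sketch using Python's built-in csv module; it reuses the extract_names/extract_comments lists from above, and the file name comments.csv is illustrative:

import csv

# collect (username, comment) rows, stripping the widget labels as before
rows = [[u.text.split("\n")[0], c.text.split("\n")[0]]
        for u, c in zip(extract_names, extract_comments)]

with open("comments.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["username", "comment"])  # header row
    writer.writerows(rows)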
Answer 2:
You can achieve the same with the requests module by issuing POST requests with the appropriate parameters to the site's GraphQL endpoint, which fetches the comments across all pages:
import requests
from urllib.parse import unquote

url = 'https://apicomment.detik.com/graphql'

payload = {
    "query": "query search($type: String!, $size: Int!,$anchor: Int!, $sort: String!, $adsLabelKanal: String, $adsEnv: String, $query: [ElasticSearchAggregation]) {\nsearch(type: $type, size: $size,page: $anchor, sort: $sort,adsLabelKanal: $adsLabelKanal, adsEnv: $adsEnv, query: $query){\npaging sorting counter counterparent profile hits {\nposisi hasAds results {\n id author content like prokontra status news create_date pilihanredaksi refer liker { id } reporter { id status_report } child { id child parent author content like prokontra status create_date pilihanredaksi refer liker { id } reporter { id status_report } authorRefer } } } }}",
    "variables": {"type": "comment", "sort": "newest", "size": 10, "anchor": 1,
                  "query": [{"name": "news.artikel", "terms": 5307853}, {"name": "news.site", "terms": "dtk"}],
                  "adsLabelKanal": "detik_finance", "adsEnv": "desktop"}
}

while True:
    # request one page of comments
    r = requests.post(url, json=payload)
    container = r.json()['data']['search']['hits']['results']
    # stop once a page comes back empty
    if not container:
        break
    for item in container:
        # skip entries without an author
        if not item['author']:
            continue
        print(item['author']['name'], unquote(item['content']))
    # advance to the next page
    payload['variables']['anchor'] += 1
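To archive these results as XLSX (the other format the question asks about), one option is the openpyxl library; this is a sketch under the assumption that openpyxl is installed (pip install openpyxl), with comments.xlsx as an illustrative file name:

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(["author", "comment"])  # header row

# inside the while loop above, replace the print(...) call with:
#     ws.append([item['author']['name'], unquote(item['content'])])

wb.save("comments.xlsx")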
Source: https://stackoverflow.com/questions/65509731/how-to-scraping-iframe-using-selenium