How to scrape all Steam IDs, review content, and profile URLs from a game's reviews on Steam into an Excel file using Python?

≡放荡痞女 提交于 2021-01-29 18:04:32

问题


#The problem: either only the first 11 reviews are printed (when `while n < 500:` is used), or nothing is printed at all (when `while True:` is used). The requirement is to save every Steam ID, review text, and profile URL from the game's reviews to Excel.

from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.keys import Keys
import re
from time import sleep
from datetime import datetime
from openpyxl import Workbook

game_id = 1097150
url = (f'https://steamcommunity.com/app/{game_id}/positivereviews/'
       '?p=1&browsefilter=trendweek&filterLanguage=english')

options = EdgeOptions()
options.use_chromium = True
driver = Edge(options=options)
driver.get(url)

# The review page loads content via infinite scroll: keep scrolling and
# scraping until three consecutive scrolls fail to move the page (i.e. no
# more reviews are being loaded).

last_position = driver.execute_script("return window.pageYOffset;")
reviews = []
seen_cards = 0  # number of cards already scraped, to avoid duplicates

running = True
while running:
    cards = driver.find_elements_by_class_name('apphub_Card')
    # Only scrape cards that appeared since the previous pass; the original
    # `cards[-20:]` could double-count or skip reviews between scrolls.
    for card in cards[seen_cards:]:
        profile_url = card.find_element_by_xpath(
            './/div[@class="apphub_friend_block"]/div/a[2]').get_attribute('href')
        steam_id = profile_url.split('/')[-2]
        date_posted = card.find_element_by_xpath(
            './/div[@class="apphub_CardTextContent"]/div').text
        review_content = card.find_element_by_xpath(
            './/div[@class="apphub_CardTextContent"]').text.replace(date_posted, '').strip()
        reviews.append((steam_id, profile_url, review_content))
    seen_cards = len(cards)

    # Scroll to the bottom; retry up to 3 times before concluding the page
    # has no more content.  (The original had an IndentationError here and
    # never broke out of the outer `while True`.)
    attempt_count = 0
    while attempt_count < 3:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(0.5)  # give new reviews time to load before measuring
        curr_position = driver.execute_script("return window.pageYOffset;")
        if curr_position == last_position:
            attempt_count += 1
        else:
            last_position = curr_position  # register the successful scroll
            break
    else:
        # Three consecutive scrolls did not move the page: end of reviews.
        running = False

driver.close()

# Save the results to an Excel workbook.

wb = Workbook()
ws = wb.worksheets[0]
ws.append(['SteamId', 'ProfileURL', 'ReviewText'])
for row in reviews:
    ws.append(row)

today = datetime.today().strftime('%Y%m%d')
wb.save(f'Steam_Reviews_{game_id}_{today}.xlsx')
wb.close()

回答1:


Here's how to scroll down indefinitely — or, in your case, until 500 elements have loaded.

# Scroll until 500 review cards have loaded, or until scrolling no longer
# moves the page (no more reviews available).
while True:
    cards = driver.find_elements_by_class_name('apphub_Card')
    if len(cards) >= 500:
        break
    last_position = driver.execute_script("return window.pageYOffset;")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Only `sleep` is imported in this file (`from time import sleep`),
    # so `time.sleep(1)` would raise NameError — call `sleep` directly.
    sleep(1)  # allow newly requested reviews to render before re-checking
    curr_position = driver.execute_script("return window.pageYOffset;")
    if last_position == curr_position:
        break  # page stopped growing: end of content

# Scrape at most the first 500 cards that were loaded.
for card in cards[:500]:
    profile_url = card.find_element_by_xpath(
        './/div[@class="apphub_friend_block"]/div/a[2]').get_attribute('href')
    steam_id = profile_url.split('/')[-2]
    date_posted = card.find_element_by_xpath(
        './/div[@class="apphub_CardTextContent"]/div').text
    review_content = card.find_element_by_xpath(
        './/div[@class="apphub_CardTextContent"]').text.replace(date_posted, '').strip()
    reviews.append((steam_id, profile_url, review_content))


来源:https://stackoverflow.com/questions/64261087/how-to-scrape-all-steam-id-review-content-profile-url-from-reviews-of-a-game-i

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!