I can see the correct data, but I cannot put it into a data frame (it appears as a list of elements). I think the problem is my understanding of the apply family of functions. Any hint is welcome.
Here is a similar question, but I think it is better to post mine as it contains more details:
Webscraping content across multiple pages using rvest package
library(rvest)
library(lubridate)
library(dplyr)
# Catalogue pages to scrape: page 1 plus pages 2-4 (pagination via ?page=N&orderBy=seq)
urls <- list("http://simple.ripley.com.pe/tv-y-video/televisores/ver-todo-tv",
"http://simple.ripley.com.pe/tv-y-video/televisores/ver-todo-tv?page=2&orderBy=seq",
"http://simple.ripley.com.pe/tv-y-video/televisores/ver-todo-tv?page=3&orderBy=seq",
"http://simple.ripley.com.pe/tv-y-video/televisores/ver-todo-tv?page=4&orderBy=seq")
# Scrape every catalogue page and return ONE data frame.
# lapply() (not sapply()) keeps each page's result as a data frame;
# sapply() was simplifying each one into a character matrix, which is
# why the output appeared as a list of matrices.
lapply(urls, function(url) {
  # Build one row per product node found on a page.
  product_info <- function(node) {
    r.precio.antes <- html_nodes(node, 'span.catalog-product-list-price') %>% html_text
    r.precio.actual <- html_nodes(node, 'span.catalog-product-offer-price') %>% html_text
    r.producto <- html_nodes(node, "span.catalog-product-name") %>% html_text
    data.frame(
      periodo = year(Sys.Date()),
      fecha = Sys.Date(),
      ecommerce = "ripley",
      producto = r.producto,
      # if/else, not ifelse(): ifelse() with a length-1 condition returns
      # only the first element of the price vector.
      precio.antes = if (length(r.precio.antes) == 0) NA else r.precio.antes,
      precio.actual = if (length(r.precio.actual) == 0) NA else r.precio.actual,
      #tarjeta.ripley = ifelse(length(r.tarjeta)==0, NA, r.tarjeta),
      stringsAsFactors = FALSE
    )
  }
  # iconv() takes to = "UTF-8"; the original misplaced the closing
  # parenthesis, passing to = to read_html() instead of iconv().
  doc <- read_html(iconv(url, to = "UTF-8")) %>%
    html_nodes("div.product-description")
  # One data frame per page; bind_rows() alone replaces do.call(rbind, ...).
  bind_rows(lapply(doc, product_info))
}) %>%
  bind_rows()  # collapse the per-page data frames into a single one
I'm getting a list of results, but I need a Data Frame.
[[1]]
[,1]
periodo "2017"
fecha "2017-05-15"
ecommerce "ripley"
producto "LG SMART TV 43'' UHD 43UH6030 "
precio.antes "Normal: S/ 2,199.00"
precio.actual "Internet: S/ 1,599.00"
[,2]
periodo "2017"
fecha "2017-05-15"
ecommerce "ripley"
producto "SAMSUNG SMART TV UHD 40\" 40KU6000 "
precio.antes "Normal: S/ 2,499.00"
precio.actual "Internet: S/ 1,599.00"
[,3]
periodo "2017"
fecha "2017-05-15"
ecommerce "ripley"
producto "SAMSUNG SMART TV UHD 50\" 50KU6000 "
precio.antes "Normal: S/ 3,299.00"
precio.actual "Internet: S/ 1,799.00"
Edit: sessionInfo()
> sessionInfo()
R version 3.4.0 (2017-04-21)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Matrix products: default
locale:
[1] LC_COLLATE=Spanish_Peru.1252
[2] LC_CTYPE=Spanish_Peru.1252
[3] LC_MONETARY=Spanish_Peru.1252
[4] LC_NUMERIC=C
[5] LC_TIME=Spanish_Peru.1252
attached base packages:
[1] stats graphics grDevices utils datasets
[6] methods base
other attached packages:
[1] rvest_0.3.2 xml2_1.1.1 dplyr_0.5.0
[4] purrr_0.2.2 readr_1.1.1 tidyr_0.6.1
[7] tibble_1.3.0 ggplot2_2.2.1.9000 tidyverse_1.1.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.10 compiler_3.4.0 cellranger_1.1.0
[4] plyr_1.8.4 forcats_0.2.0 tools_3.4.0
[7] jsonlite_1.4 lubridate_1.6.0 gtable_0.2.0
[10] nlme_3.1-131 lattice_0.20-35 psych_1.7.5
[13] DBI_0.6-1 curl_2.6 parallel_3.4.0
[16] haven_1.0.0 stringr_1.2.0 httr_1.2.1
[19] hms_0.3 grid_3.4.0 R6_2.2.0
[22] XML_3.98-1.7 readxl_1.0.0 foreign_0.8-67
[25] selectr_0.3-1 reshape2_1.4.2 modelr_0.1.0
[28] magrittr_1.5 scales_0.4.1 assertthat_0.2.0
[31] mnormt_1.5-5 colorspace_1.3-2 stringi_1.1.5
[34] lazyeval_0.2.0 munsell_0.4.3 broom_0.4.2
purrr::map_df
is a version of lapply
that coerces the result to a data.frame, which lets you do
library(tidyverse)
library(rvest)
# Catalogue pages to scrape: page 1 plus pages 2-4 (pagination via ?page=N&orderBy=seq)
urls <- list("http://simple.ripley.com.pe/tv-y-video/televisores/ver-todo-tv",
"http://simple.ripley.com.pe/tv-y-video/televisores/ver-todo-tv?page=2&orderBy=seq",
"http://simple.ripley.com.pe/tv-y-video/televisores/ver-todo-tv?page=3&orderBy=seq",
"http://simple.ripley.com.pe/tv-y-video/televisores/ver-todo-tv?page=4&orderBy=seq")
h <- urls %>% map(read_html) # scrape once, parse as necessary
# Parse each pre-fetched page and row-bind the results into one tibble.
h %>% map_df(~{
  r.precio.antes <- html_nodes(.x, 'span.catalog-product-list-price') %>% html_text
  r.precio.actual <- html_nodes(.x, 'span.catalog-product-offer-price') %>% html_text
  # tibble() replaces the deprecated data_frame(); the trailing comma after
  # the last argument (a syntax error for tibble) is removed.
  tibble(
    periodo = lubridate::year(Sys.Date()),
    fecha = Sys.Date(),
    ecommerce = "ripley",
    producto = html_nodes(.x, "span.catalog-product-name") %>% html_text,
    # if/else, not ifelse(): a scalar condition makes ifelse() return a
    # single element, which tibble would then recycle across every row.
    precio.antes = if (length(r.precio.antes) == 0) NA_character_ else r.precio.antes,
    precio.actual = if (length(r.precio.actual) == 0) NA_character_ else r.precio.actual
  )})
#> # A tibble: 85 x 6
#> periodo fecha ecommerce producto
#> <dbl> <date> <chr> <chr>
#> 1 2017 2017-05-16 ripley LG SMART TV 43'' UHD 43UH6030
#> 2 2017 2017-05-16 ripley SAMSUNG SMART TV UHD 40" 40KU6000
#> 3 2017 2017-05-16 ripley SAMSUNG SMART TV UHD 50" 50KU6000
#> 4 2017 2017-05-16 ripley SAMSUNG SMART TV UHD 49" CURVO 49KU6300
#> 5 2017 2017-05-16 ripley SAMSUNG SMART TV UHD 55" CURVO 55KU6300
#> 6 2017 2017-05-16 ripley SAMSUNG SMART TV UHD 55" CURVO 55KU6500
#> 7 2017 2017-05-16 ripley SAMSUNG SMART TV UHD 65" CURVO 65KU6500
#> 8 2017 2017-05-16 ripley LG SMART TV UHD 49'' 49UH6500
#> 9 2017 2017-05-16 ripley LG SMART TV 55'' UHD 55UH6030
#> 10 2017 2017-05-16 ripley LG SMART TV OLED 4K 55" OLED55B6P
#> # ... with 75 more rows, and 2 more variables: precio.antes <chr>,
#> # precio.actual <chr>
Alternatively, to fix the list of matrices in base R — where x
is the list resulting from the code above:
# Transpose each one-column matrix into a row, stack the rows, then let
# type.convert() restore numeric columns from their character form.
rows <- lapply(x, t)
df <- as.data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
# equivalent: df <- as.data.frame(t(do.call(cbind, x)), stringsAsFactors = FALSE)
df[] <- lapply(df, type.convert, as.is = TRUE)
来源:https://stackoverflow.com/questions/43992203/r-webscraping-a-list-of-urls-to-get-a-dataframe