note: ipums international and ipums usa probably use the same system. ipums usa allows quicker signup. if you would like to test out your code, try https://usa.ipums.org/usa-a
@HubertL has taken many steps in the right direction; however, I think his answer is incomplete.
First of all, the main thing to look at when you're implementing automatic web authorization is the cookies being used during 'normal' manual workflow. You can easily spy on them with dev tools in any modern browser:
Here, we see JSESSIONID
and _shibsession*
cookies. The first one holds the JSP session id of the website; the second is most likely used solely for Shibboleth authorization. The server probably binds them together somehow, but JSESSIONID
doesn't require authorization and you get it right away after opening the website. So, we must get _shibsession*
cookie for our JSESSIONID
to be authorized. That is what Shibboleth's authorization process, with its many redirects, is about. See the comments in the code.
# Log in to IPUMS USA through its Shibboleth single sign-on and return the
# resulting session cookies.
#
# Arguments:
#   user     - account user name, sent as the `j_username` form field
#   password - account password, sent as the `j_password` form field
#
# Returns a named character vector (names = cookie names, values = cookie
# values) taken from the last response of the login flow. Pass it to
# set_cookies(.cookies = ...) on later requests.
#
# Side effects: attaches httr and rvest, changes the global httr config and
# resets the httr handle for https://usa.ipums.org/.
#
# Stops with an error when the identity provider's reply does not contain the
# expected SAML form (typically wrong credentials).
login_ipums <- function(user, password)
{
  # library() fails immediately when a package is missing; require() would
  # only return FALSE and let the function die later with a confusing error
  library(httr)
  library(rvest)
  # NOTE(review): this switches off TLS certificate verification for every
  # later httr request in the session, including the one carrying the
  # password. Kept because code after the call may rely on it - confirm it is
  # still needed and drop it if the server certificate validates.
  set_config(config(ssl_verifypeer = 0L))
  # important - httr preserves cookies on subsequent requests to the same host,
  # we don't need that because of sessions expiration
  handle_reset("https://usa.ipums.org/")

  # turn the cookie table of a response into a name -> value vector
  cookie_vector <- function(response) {
    stats::setNames(response$cookies$value, response$cookies$name)
  }

  # step 1: open the login page; login1$url is the URL we ended up on after
  # the redirects, and that is where the credentials are posted
  login1 <- GET("https://usa.ipums.org/usa-action/users/login")

  # step 2: post login and password; the auth tokens come back as html hidden
  # fields in a form
  form_auth <- list("j_username" = user, "j_password" = password)
  login2 <- POST(login1$url, body = form_auth,
                 set_cookies(.cookies = cookie_vector(login1)),
                 encode = "form")
  login2_form <- html_form(read_html(login2$content))
  if (length(login2_form) == 0L) {
    stop("IPUMS login failed: the reply contains no form ",
         "(check user name and password)", call. = FALSE)
  }
  saml_form <- login2_form[[1]]
  relay_state <- saml_form$fields$RelayState$value
  saml_response <- saml_form$fields$SAMLResponse$value
  if (is.null(saml_response)) {
    stop("IPUMS login failed: the reply contains no SAMLResponse field ",
         "(check user name and password)", call. = FALSE)
  }
  # the form target is read from `url`, falling back to `action`
  # (presumably the element name depends on the rvest version - TODO confirm)
  saml_target <- if (is.null(saml_form$url)) saml_form$action else saml_form$url

  # step 3: submit the form back (browser submits it back automatically with JS)
  login3 <- POST(saml_target,
                 body = list(RelayState = relay_state,
                             SAMLResponse = saml_response),
                 set_cookies(.cookies = cookie_vector(login2)),
                 encode = "form")

  # now we have what we came for - _shibsession_* and JSESSION id cookie
  cookie_vector(login3)
}
After the call to login_ipums
we'll have the following cookies:
> cookies=login_ipums(my_email, my_password)
> names(cookies)
[1] "JSESSIONID"
[2] "_idp_authn_lc_key"
[3] "_shibsession_7573612e69..."
Here, we have both JSESSIONID
and _shibsession_*
used for site-wide authorization. _idp_authn_lc_key
is probably not needed, but leaving it in won't hurt.
Now you can easily download files like this:
# Log in, then download an extract file with the session cookies attached.
cookies <- login_ipums(my_email, my_password)
# write_disk() streams the response body straight into file.bin
target <- GET(
  "https://usa.ipums.org/usa-action/downloads/extract_files/usa_00001.dat.gz",
  set_cookies(.cookies = cookies),
  write_disk("file.bin", overwrite = TRUE)
)
# fail loudly on an HTTP error status instead of silently leaving the error
# response in file.bin
stop_for_status(target)
IMPORTANT NOTE: As you can see, I used IPUMS USA, not International. To test this code with your own account, replace usa
with international
everywhere, including *-action
in URLs.
You have to use set_cookies()
to send your cookies to the server:
library(httr)
library(rvest)
#my_email <- "xxx"
#my_password <- "yyy"
# Walk through the IPUMS International login by hand: open the login page,
# post the credentials, post the hidden SAML form back, then fetch a page
# with the collected cookies and show one line of it.
tf <- tempfile()
# NOTE(review): this switches off TLS certificate verification for every httr
# request in the session, including the one carrying the password - confirm it
# is still needed and drop it if the server certificate validates.
set_config( config( ssl_verifypeer = 0L ) )
# Get first page
p1 <- GET( "https://international.ipums.org/international-action/users/login" , verbose( info = TRUE ) )
# Post Login credentials
b2 <- list( "j_username" = my_email , "j_password" = my_password )
c2 <- c(JSESSIONID=p1$cookies[p1$cookies$domain=="#HttpOnly_live.identity.popdata.org",]$value,
        `_idp_authn_lc_key`=p1$cookies[p1$cookies$domain=="live.identity.popdata.org",]$value)
p2 <- POST(p1$url,body = b2, set_cookies(.cookies = c2), encode="form" )
# Parse hidden fields
h2 <- read_html(p2$content)
form <- h2 %>% html_form()
# Post hidden fields
# pick the fields by name so the result does not depend on their order in the form
b3 <- list( "RelayState"=form[[1]]$fields$RelayState$value, "SAMLResponse"=form[[1]]$fields$SAMLResponse$value)
c3 <- c(JSESSIONID=p1$cookies[p1$cookies$domain=="#HttpOnly_live.identity.popdata.org",]$value,
        `_idp_session`=p2$cookies[p2$cookies$name=="_idp_session",]$value,
        `_idp_authn_lc_key`=p2$cookies[p2$cookies$name=="_idp_authn_lc_key",]$value)
# the form target is read from `url`, falling back to `action`
# (presumably the element name depends on the rvest version - TODO confirm)
form_url <- if (is.null(form[[1]]$url)) form[[1]]$action else form[[1]]$url
p3 <- POST( form_url , body=b3, set_cookies(.cookies = c3), encode = "form")
# Get interesting page
# the cookie table can carry a "#HttpOnly_" prefix on the domain (see c2 above),
# so strip it before comparing
p3_domain <- sub("^#HttpOnly_", "", p3$cookies$domain)
# row masks must be built with the elementwise `&` and from p3's own columns;
# the scalar `&&` is an error on vectors in current R
c4 <- c(JSESSIONID=p3$cookies[p3_domain=="international.ipums.org" & p3$cookies$name=="JSESSIONID",]$value,
        `_idp_session`=p3$cookies[p3$cookies$name=="_idp_session",]$value,
        `_idp_authn_lc_key`=p3$cookies[p3$cookies$name=="_idp_authn_lc_key",]$value)
p4 <- GET( "https://international.ipums.org/international-action/menu", set_cookies(.cookies = c4) )
# dump the page to a temp file and show line 55 of it
writeBin(p4$content , tf )
readLines( tf )[55]
Since the result is
[1] " <li class=\"lastItem\"><a href=\"/international-action/users/logout\">Logout</a></li>"
I think you're logged in...