I have a wine dataset with a column called "title" which contains the title of the wine, including its vintage year. See the sample:
You can use sub()
or regexec()
from base R by searching for numbers that have 4 digits:
# Sample titles, each containing exactly one four-digit year.
string <- c(
  "R2 2013 Camp 4 Vineyard Grenache Blanc",
  "Santa Ynez Valley 1999",
  "dsdd 2015"
)

# Approach 1: replace the whole string with the captured four digits.
# The leading `.*` is greedy, so the last four-digit run wins; a string
# without any four-digit run is returned unchanged.
sub("^.*([0-9]{4}).*", "\\1", string)

# Approach 2: pull out the first four-digit run of each element.
# unlist() drops elements that had no match, so the result can be shorter
# than the input.
unlist(regmatches(string, regexec("[0-9]{4}", string)))
For your case:
# create a helper function
#' Extract a four-digit year from each string
#'
#' Finds the first run of four consecutive digits in every element of
#' `string` and converts it to a number. The pattern is not anchored, so the
#' first four digits of a longer digit sequence also count as a match.
#'
#' @param string A character vector (e.g. wine titles).
#' @return A numeric vector with one element per element of `string`;
#'   elements without a four-digit run are `NA`.
yearExtract <- function(string) {
  # regexec() reports only the first match per element; regmatches() yields
  # character(0) for elements that did not match.
  hits <- regmatches(string, regexec("[0-9]{4}", string))
  # vapply() pins the result to numeric. sapply() would return an empty list
  # for zero-length input and a logical vector when nothing matches.
  vapply(
    hits,
    function(x) {
      if (length(x) > 0) as.numeric(x) else NA_real_
    },
    numeric(1)
  )
}
# Build a small example data frame; the last title has no four-digit year.
title <- c(
  "R2 2013 Camp 4 Vineyard Grenache Blanc",
  "Santa Ynez Valley 1999",
  "dsdd 15"
)
distributor <- c("a", "b", "d")
wine_tidy2 <- data.frame(title, distributor)

# as.character() guards against the title column having been stored as a
# factor by data.frame().
wine_tidy2[["vintage_year"]] <- yearExtract(
  as.character(wine_tidy2[["title"]])
)