问题
Here is mtcars data in the MonetDBLite database file.
library(MonetDBLite)
library(tidyverse)
library(DBI)
dbdir <- getwd()
con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
dbWriteTable(conn = con, name = "mtcars_1", value = mtcars)
data_mt <- con %>% tbl("mtcars_1")
I want to use dplyr mutate to create new variables and add (commit!) that to the database table? Something like
data_mt %>% select(mpg, cyl) %>% mutate(var = mpg/cyl) %>% dbCommit(con)
The desired output should be same when we do:
dbSendQuery(con, "ALTER TABLE mtcars_1 ADD COLUMN var DOUBLE PRECISION")
dbSendQuery(con, "UPDATE mtcars_1 SET var=mpg/cyl")
How can do that?
回答1:
Here's a couple of functions, create
and update.tbl_lazy
.
They respectively implement CREATE TABLE
, which was straightforward, and the ALTER TABLE
/UPDATE
pair which is much less so:
CREATE
create <- function(data,name){
DBI::dbSendQuery(data$src$con,
paste("CREATE TABLE", name,"AS", dbplyr::sql_render(data)))
dplyr::tbl(data$src$con,name)
}
example:
library(dbplyr)
library(DBI)
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:")
copy_to(con, head(iris,3),"iris")
tbl(con,"iris") %>% mutate(Sepal.Area= Sepal.Length * Sepal.Width) %>% create("iris_2")
# # Source: table<iris_2> [?? x 6]
# # Database: sqlite 3.22.0 []
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Area
# <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
# 1 5.1 3.5 1.4 0.2 setosa 17.8
# 2 4.9 3 1.4 0.2 setosa 14.7
# 3 4.7 3.2 1.3 0.2 setosa 15.0
UPDATE
update.tbl_lazy <- function(.data,...,new_type="DOUBLE PRECISION"){
quos <- rlang::quos(...)
dots <- rlang::exprs_auto_name(quos, printer = tidy_text)
# extract key parameters from query
sql <- dbplyr::sql_render(.data)
con <- .data$src$con
table_name <-gsub(".*?(FROM (`|\")(.+?)(`|\")).*","\\3",sql)
if(grepl("\nWHERE ",sql)) where <- regmatches(sql, regexpr("WHERE .*",sql))
else where <- ""
new_cols <- setdiff(names(dots),colnames(.data))
# Add empty columns to base table
if(length(new_cols)){
alter_queries <- paste("ALTER TABLE",table_name,"ADD COLUMN",new_cols,new_type)
purrr::walk(alter_queries, ~{
rs <- DBI::dbSendStatement(con, .)
DBI::dbClearResult(rs)})}
# translate unevaluated dot arguments to SQL instructions as character
translations <- purrr::map_chr(dots, ~ translate_sql(!!! .))
# messy hack to make translations work
translations <- gsub("OVER \\(\\)","",translations)
# 2 possibilities: called group_by or (called filter or called nothing)
if(identical(.data$ops$name,"group_by")){
# ERROR if `filter` and `group_by` both used
if(where != "") stop("Using both `filter` and `group by` is not supported")
# Build aggregated table
gb_cols <- paste0('"',.data$ops$dots,'"',collapse=", ")
gb_query0 <- paste(translations,"AS", names(dots),collapse=", ")
gb_query <- paste("CREATE TABLE TEMP_GB_TABLE AS SELECT",
gb_cols,", ",gb_query0,
"FROM", table_name,"GROUP BY", gb_cols)
rs <- DBI::dbSendStatement(con, gb_query)
DBI::dbClearResult(rs)
# Delete temp table on exit
on.exit({
rs <- DBI::dbSendStatement(con,"DROP TABLE TEMP_GB_TABLE")
DBI::dbClearResult(rs)
})
# Build update query
gb_on <- paste0(table_name,'."',.data$ops$dots,'" = TEMP_GB_TABLE."', .data$ops$dots,'"',collapse=" AND ")
update_query0 <- paste0(names(dots)," = (SELECT ", names(dots), " FROM TEMP_GB_TABLE WHERE ",gb_on,")",
collapse=", ")
update_query <- paste("UPDATE", table_name, "SET", update_query0)
rs <- DBI::dbSendStatement(con, update_query)
DBI::dbClearResult(rs)
} else {
# Build update query in case of no group_by and optional where
update_query0 <- paste(names(dots),'=',translations,collapse=", ")
update_query <- paste("UPDATE", table_name,"SET", update_query0,where)
rs <- DBI::dbSendStatement(con, update_query)
DBI::dbClearResult(rs)
}
tbl(con,table_name)
}
example 1, define 2 new numeric columns :
tbl(con,"iris") %>% update(x=pmax(Sepal.Length,Sepal.Width),
y=pmin(Sepal.Length,Sepal.Width))
# # Source: table<iris> [?? x 7]
# # Database: sqlite 3.22.0 []
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species x y
# <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2 setosa 5.1 3.5
# 2 4.9 3 1.4 0.2 setosa 4.9 3
# 3 4.7 3.2 1.3 0.2 setosa 4.7 3.2
example 2, modify an existing column, create 2 new columns of different types :
tbl(con,"iris") %>%
update(x= Sepal.Length*Sepal.Width,
z= 2*y,
a= Species %||% Species,
new_type = c("DOUBLE","VARCHAR(255)"))
# # Source: table<iris> [?? x 9]
# # Database: sqlite 3.22.0 []
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species x y z a
# <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr>
# 1 5.1 3.5 1.4 0.2 setosa 17.8 3.5 7 setosasetosa
# 2 4.9 3 1.4 0.2 setosa 14.7 3 6 setosasetosa
# 3 4.7 3.2 1.3 0.2 setosa 15.0 3.2 6.4 setosasetosa
example 3, update where:
tbl(con,"iris") %>% filter(Sepal.Width > 3) %>% update(a="foo")
# # Source: table<iris> [?? x 9]
# # Database: sqlite 3.22.0 []
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species x y z a
# <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr>
# 1 5.1 3.5 1.4 0.2 setosa 17.8 3.5 7 foo
# 2 4.9 3 1.4 0.2 setosa 14.7 3 6 setosasetosa
# 3 4.7 3.2 1.3 0.2 setosa 15.0 3.2 6.4 foo
example 4 : update by group
tbl(con,"iris") %>%
group_by(Species, Petal.Width) %>%
update(new_col1 = sum(Sepal.Width,na.rm=TRUE), # using a R function
new_col2 = MAX(Sepal.Length)) # using native SQL
# # Source: SQL [?? x 11]
# # Database: sqlite 3.22.0 []
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species x y z a new_col1 new_col2
# <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2 setosa 1 2 7 foo 6.5 5.1
# 2 4.9 3 1.4 0.2 setosa 1 2 6 setosasetosa 6.5 5.1
# 3 7 3.2 4.7 1.4 versicolor 1 2 6.4 foo 3.2 7
GENERAL NOTES
The code uses uses
dbplyr::translate_sql
so we can use R functions or native ones alike just like in good oldmutate
calls.update
can only be used after onefilter
call OR onegroup_by
call OR zero of each, anything else and you'll get an error or unexpected results.The
group_by
implementation is VERY hacky, so no room for defining columns on the fly or grouping by an operation, stick to the basics.update
andcreate
both returntbl(con, table_name)
, which means you can chain as manycreate
orupdate
calls as you wish, with the appropriate amount ofgroup_by
andfilter
in between. In fact all of my 4 examples can be chained.To hammer the nail,
create
doesn't suffer from the same restrictions, you can have as muchdbplyr
fun as desired before calling it.I didn't implement type detection, so I needed the
new_type
parameter, it is recycled in thepaste
call of thealter_queries
definition in my code so it can be a single value or a vector.
One way to solve the latter would be to extract the variables from the translations
variable, find their types in dbGetQuery(con,"PRAGMA table_info(iris)")
. Then we need coercion rules between all existing types, and we're set. But as different DBMS have different types I can't think of a general way to do it, and I don't know MonetDBLite
.
来源:https://stackoverflow.com/questions/50766908/mutate-variables-in-database-tables-directly-using-dplyr