问题
Please find below my data: rows are disease groups (0 = control, 1 = ulcerative colitis, 2 = Crohn's disease) and columns are gene expression values.
structure(c(5.54312e-05, 5.6112e-06, 9.74312e-05, 1.3612e-06,
1.29312e-05, 7.2512e-06, 0.0002159302, 3.6312e-06, 0.0001467552,
1.53312e-05, 0.0009132182, 1.9312e-06, 0.0074214952, 0.0006480372,
5.1312e-06, 6.1812e-06, 4.7612e-06, 0.0001199302, 0.0008845182,
0.0008506632, 0.0002366382, 7.3912e-06, 8.5112e-06, 2.63312e-05,
0.0013685242, 1.12312e-05, 0.0001775992, 0.0063385632, 0.0061628972,
0.0406951632, 0.0132550862, 0.0330866502, 0.0741588422, 0.0049675282,
0.0124742612, 0.0432014482, 0.0114703162, 0.0384477822, 0.0188251552,
0.0277018382, 0.0633737932, 0.0053745442, 0.0488762832, 0.0099598792,
0.0044341092, 0.0041768872, 0.0152889442, 0.0602260842, 0.0512892512,
0.0065575852, 0.0174603572, 0.0076848152, 0.0021076082, 0.0057732232,
0.0761864242, 0.0376310742, 0.0521594242, 0.0121793962, 0.0471997972,
0.0224588692, 0.0302616442, 0.0062663212, 0.0286649272, 0.0228584812,
0.0280185812, 0.0176817072, 0.0405636232, 0.0297912062, 0.0347780872,
0.0193185042, 0.0118479432, 0.0096142082, 0.0640275732, 0.0353341802,
0.0416389862, 0.0560150452, 0.0330486812, 0.0176602362, 0.0301871972,
0.0579195622, 0.0299905202, 0.0001129152, 0.0009209172, 0.0010817792,
0.0001951902, 0.0016784762, 0.0001716432, 0.0001917332, 0.0005600662,
0.0003840872, 0.0004548142, 0.0007234162, 0.0002039282, 0.0009733682,
0.0008222022, 0.0006205572, 0.0002608002, 0.0002146382, 0.0020774742,
0.0006584612, 0.0004037032, 0.0003786822, 0.0004093372, 0.0017226182,
0.0002138162, 0.0001766742, 0.0020229092, 0.0018869602, 0.0530292672,
0.0225949962, 0.0119676672, 0.0268511442, 0.0377380112, 0.0313562992,
0.1041032912, 0.0632652472, 0.0180284852, 0.1160380322, 0.0057282012,
0.0536359992, 0.0591269722, 0.0118352722, 0.0046396552, 0.0143029422,
0.0829488842, 0.0152022692, 0.0212954622, 0.0433420312, 0.0081537062,
0.0156137782, 0.0432896402, 0.0488343522, 0.0191447942, 0.0598099022,
0.0069907162, 0.0408296912, 0.0298613812, 0.0614052022, 0.0061426502,
0.0097676332, 0.0354280242, 0.0372933212, 0.0130974212, 0.0022172112,
0.0402114242, 0.0063038722, 0.0301466432, 0.0320339102, 0.0245904292,
0.0779917522, 0.0172156972, 0.0147311782, 0.0480258512, 0.0316871712,
0.0324477412, 0.0322786442, 0.0173019162, 0.0134506982, 0.0402077862,
0.0426696462, 0.0345675212, 0.0346313502, 1.93312e-05, 4.7512e-06,
5.41312e-05, 3.12e-08, 1.91312e-05, 9.642e-07, 6.0112e-06, 3.12e-08,
1.0812e-06, 4.412e-07, 7.72312e-05, 4.382e-07, 0.0005851852,
0.0002470232, 7.8912e-06, 3.1612e-06, 2.1712e-06, 7.5912e-06,
9.29312e-05, 0.0001160552, 5.51312e-05, 7.8212e-06, 6.6812e-06,
2.0912e-06, 0.0001043732, 4.1912e-06, 1.27312e-05, 0.0001975332,
0.0001513812, 0.0001073372, 6.54312e-05, 0.0002255952, 0.0001426622,
0.0001689042, 3.50312e-05, 0.0003652732, 0.0001742852, 0.0003393582,
8.70312e-05, 0.0001367102, 0.0001566652, 0.0002242122, 0.0002053362,
8.87312e-05, 0.0003058052, 0.0001336462, 0.0001512112, 0.0001072602,
0.0001626102, 0.0001522802, 6.88312e-05, 0.0001138952, 0.0002492892,
0.0002425912, 0.0007929912, 0.0076409822, 0.0049373582, 0.0004223922,
0.0009535442, 0.0009512182, 0.0006713372, 0.0011064372, 0.0026065992,
0.0030068982, 0.0019116772, 0.0013541412, 0.0124617692, 0.0004349482,
0.0023764912, 0.0078575922, 0.0004369202, 0.0004881912, 0.0003481772,
0.0009314802, 0.0003240052, 0.0049453522, 0.0006938762, 0.0004796032,
0.0008434462, 0.0014197062, 0.0015475092, 8.16312e-05, 6.63312e-05,
0.0001016142, 3.08312e-05, 0.0001470702, 5.13312e-05, 0.0001095102,
2.39312e-05, 0.0002255062, 4.28312e-05, 0.0002308162, 2.10312e-05,
0.0001356312, 0.0001242042, 0.0002451592, 0.0002754772, 3.18312e-05,
0.0001751912, 0.0001802232, 0.0002467002, 0.0003787392, 4.35312e-05,
0.0002678552, 7.20312e-05, 7.65312e-05, 8.79312e-05, 0.0001300572,
0.0001114932, 3.17312e-05, 0.0002001272, 3.1512e-06, 8.75312e-05,
3.1412e-06, 6.9212e-06, 0.0001659672, 5.98312e-05, 0.0002013862,
5.9512e-06, 2.57312e-05, 2.53312e-05, 3.27312e-05, 0.0001374772,
0.0001344332, 6.172e-07, 3.90312e-05, 0.0188869402, 0.0503434972,
4.15312e-05, 1.67312e-05, 0.0001726452, 4.95312e-05, 1.27312e-05,
9.85312e-05, 4.28312e-05, 0.0027084332, 0.0032156172, 0.0045711912,
0.0017135802, 0.0243532152, 0.0066607792, 0.0031989182, 0.0030944172,
0.0047891942, 0.0028169862, 0.0215873442, 0.0020847562, 0.0037806512,
0.0217515262, 0.0090971742, 0.0122162562, 0.0011257962, 0.0130435652,
0.0055148042, 0.0083239932, 0.0268987952, 0.0021491662, 0.0080216542,
0.0066735982, 0.0053911702, 0.0185785902, 0.0137863282, 0.0008059812,
0.0012895362, 0.0024514472, 0.0002341382, 0.0016947642, 0.0002882062,
0.0023575092, 0.0008561602, 0.0015975512, 0.0001175692, 0.0001666122,
0.0003673192, 0.0010398722, 0.0017795592, 0.0004381232, 0.0010125462,
0.0005299672, 0.0031931172, 0.0025627332, 0.0027740412, 0.0030131672,
0.0013492282, 0.0016463272, 0.0011142532, 0.0012079132, 0.0028049802,
0.0003664502), .Dim = c(27L, 13L), .Dimnames = list(c("2", "0",
"0", "0", "1", "0", "0", "1", "1", "1", "2", "0", "0", "1", "2",
"2", "1", "2", "2", "2", "2", "1", "1", "2", "2", "0", "0"),
c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6", "Gene7",
"Gene8", "Gene9", "Gene10", "Gene11", "Gene12", "Gene13")))
I used the bestNormalize package to apply the Box-Cox transformation to the individual columns (after extracting each one as a vector), e.g.
# Fit a Box-Cox transformation to the first column only, then extract the
# transformed values from the fitted object.
values <- boxcox(data[, 1], standardize = TRUE)
normvalues <- predict(values)
When I instead apply the Box-Cox transformation to the whole matrix and inspect the values in column 1, using
# Same call, but on the whole matrix at once.
# NOTE(review): boxcox() is documented as taking a vector, so this presumably
# fits a single transformation to all values pooled together rather than one
# per column -- confirm against the bestNormalize help page.
process <- boxcox(data, standardize = TRUE)
norm <- predict(process)
the values are not the same. Whilst neither achieves normality (judging by histograms and ad.test), how do I know which is the correct one? I understand that, according to the bestNormalize help page for boxcox(x, ...), x needs to be a vector.
If I need to apply the Box-Cox transformation to each individual column vector (columns 1:13) rather than to the whole matrix, I am lost as to how I would devise a for loop or use the apply function for this.
Any suggestions would be appreciated. My attempts at loops (currently non-functioning) are below:
# Corrected form of the attempt. apply() with MARGIN = 2 already visits every
# column of the matrix, so no outer for loop is needed. The argument name is
# MARGIN (upper case), the function must accept the column it is given, and
# predict() is called on the whole fitted object, not on process[i].
normvalues <- apply(data, MARGIN = 2, FUN = function(column) {
  process <- boxcox(column, standardize = TRUE)
  predict(process)
})
# Carry the row (sample) and column (gene) labels over from the input so the
# result is a labelled matrix with the same shape as `data`.
dimnames(normvalues) <- dimnames(data)
print(normvalues)
Or using pipes?
# Corrected form of the pipe attempt. A pipe must end the line it continues
# from; a line that starts with %>% is a syntax error. seq_len() is used so
# the loop body is skipped when there are no columns.
# Note: normalcheck5 is overwritten on every pass, so after the loop it holds
# only the last column's transformed values.
for (i in seq_len(ncol(stacknew))) {
  normalcheck5 <- stacknew[, i] %>%
    boxcox() %>%
    predict()
  print(normalcheck5)
}
I need the output in a 27 x 13 matrix containing the values of the transformed gene expression where the transformation has been applied to each column vector individually. Any suggestions would be helpful please.
回答1:
I would solve your problem as follows.
# packages
library(bestNormalize)
library(tidyr)
library(ggplot2)
library(dplyr)
library(stringr)
# data: 27 x 13 numeric matrix (see .Dim below). Columns are Gene1..Gene13;
# row names are the disease-group code of each sample, which the question
# describes as 0 = control, 1 = ulcerative colitis, 2 = Crohn's disease.
# The values are listed column by column, so the first 27 numbers are Gene1.
my_data <- structure(c(
5.54312e-05, 5.6112e-06, 9.74312e-05, 1.3612e-06,
1.29312e-05, 7.2512e-06, 0.0002159302, 3.6312e-06, 0.0001467552,
1.53312e-05, 0.0009132182, 1.9312e-06, 0.0074214952, 0.0006480372,
5.1312e-06, 6.1812e-06, 4.7612e-06, 0.0001199302, 0.0008845182,
0.0008506632, 0.0002366382, 7.3912e-06, 8.5112e-06, 2.63312e-05,
0.0013685242, 1.12312e-05, 0.0001775992, 0.0063385632, 0.0061628972,
0.0406951632, 0.0132550862, 0.0330866502, 0.0741588422, 0.0049675282,
0.0124742612, 0.0432014482, 0.0114703162, 0.0384477822, 0.0188251552,
0.0277018382, 0.0633737932, 0.0053745442, 0.0488762832, 0.0099598792,
0.0044341092, 0.0041768872, 0.0152889442, 0.0602260842, 0.0512892512,
0.0065575852, 0.0174603572, 0.0076848152, 0.0021076082, 0.0057732232,
0.0761864242, 0.0376310742, 0.0521594242, 0.0121793962, 0.0471997972,
0.0224588692, 0.0302616442, 0.0062663212, 0.0286649272, 0.0228584812,
0.0280185812, 0.0176817072, 0.0405636232, 0.0297912062, 0.0347780872,
0.0193185042, 0.0118479432, 0.0096142082, 0.0640275732, 0.0353341802,
0.0416389862, 0.0560150452, 0.0330486812, 0.0176602362, 0.0301871972,
0.0579195622, 0.0299905202, 0.0001129152, 0.0009209172, 0.0010817792,
0.0001951902, 0.0016784762, 0.0001716432, 0.0001917332, 0.0005600662,
0.0003840872, 0.0004548142, 0.0007234162, 0.0002039282, 0.0009733682,
0.0008222022, 0.0006205572, 0.0002608002, 0.0002146382, 0.0020774742,
0.0006584612, 0.0004037032, 0.0003786822, 0.0004093372, 0.0017226182,
0.0002138162, 0.0001766742, 0.0020229092, 0.0018869602, 0.0530292672,
0.0225949962, 0.0119676672, 0.0268511442, 0.0377380112, 0.0313562992,
0.1041032912, 0.0632652472, 0.0180284852, 0.1160380322, 0.0057282012,
0.0536359992, 0.0591269722, 0.0118352722, 0.0046396552, 0.0143029422,
0.0829488842, 0.0152022692, 0.0212954622, 0.0433420312, 0.0081537062,
0.0156137782, 0.0432896402, 0.0488343522, 0.0191447942, 0.0598099022,
0.0069907162, 0.0408296912, 0.0298613812, 0.0614052022, 0.0061426502,
0.0097676332, 0.0354280242, 0.0372933212, 0.0130974212, 0.0022172112,
0.0402114242, 0.0063038722, 0.0301466432, 0.0320339102, 0.0245904292,
0.0779917522, 0.0172156972, 0.0147311782, 0.0480258512, 0.0316871712,
0.0324477412, 0.0322786442, 0.0173019162, 0.0134506982, 0.0402077862,
0.0426696462, 0.0345675212, 0.0346313502, 1.93312e-05, 4.7512e-06,
5.41312e-05, 3.12e-08, 1.91312e-05, 9.642e-07, 6.0112e-06, 3.12e-08,
1.0812e-06, 4.412e-07, 7.72312e-05, 4.382e-07, 0.0005851852,
0.0002470232, 7.8912e-06, 3.1612e-06, 2.1712e-06, 7.5912e-06,
9.29312e-05, 0.0001160552, 5.51312e-05, 7.8212e-06, 6.6812e-06,
2.0912e-06, 0.0001043732, 4.1912e-06, 1.27312e-05, 0.0001975332,
0.0001513812, 0.0001073372, 6.54312e-05, 0.0002255952, 0.0001426622,
0.0001689042, 3.50312e-05, 0.0003652732, 0.0001742852, 0.0003393582,
8.70312e-05, 0.0001367102, 0.0001566652, 0.0002242122, 0.0002053362,
8.87312e-05, 0.0003058052, 0.0001336462, 0.0001512112, 0.0001072602,
0.0001626102, 0.0001522802, 6.88312e-05, 0.0001138952, 0.0002492892,
0.0002425912, 0.0007929912, 0.0076409822, 0.0049373582, 0.0004223922,
0.0009535442, 0.0009512182, 0.0006713372, 0.0011064372, 0.0026065992,
0.0030068982, 0.0019116772, 0.0013541412, 0.0124617692, 0.0004349482,
0.0023764912, 0.0078575922, 0.0004369202, 0.0004881912, 0.0003481772,
0.0009314802, 0.0003240052, 0.0049453522, 0.0006938762, 0.0004796032,
0.0008434462, 0.0014197062, 0.0015475092, 8.16312e-05, 6.63312e-05,
0.0001016142, 3.08312e-05, 0.0001470702, 5.13312e-05, 0.0001095102,
2.39312e-05, 0.0002255062, 4.28312e-05, 0.0002308162, 2.10312e-05,
0.0001356312, 0.0001242042, 0.0002451592, 0.0002754772, 3.18312e-05,
0.0001751912, 0.0001802232, 0.0002467002, 0.0003787392, 4.35312e-05,
0.0002678552, 7.20312e-05, 7.65312e-05, 8.79312e-05, 0.0001300572,
0.0001114932, 3.17312e-05, 0.0002001272, 3.1512e-06, 8.75312e-05,
3.1412e-06, 6.9212e-06, 0.0001659672, 5.98312e-05, 0.0002013862,
5.9512e-06, 2.57312e-05, 2.53312e-05, 3.27312e-05, 0.0001374772,
0.0001344332, 6.172e-07, 3.90312e-05, 0.0188869402, 0.0503434972,
4.15312e-05, 1.67312e-05, 0.0001726452, 4.95312e-05, 1.27312e-05,
9.85312e-05, 4.28312e-05, 0.0027084332, 0.0032156172, 0.0045711912,
0.0017135802, 0.0243532152, 0.0066607792, 0.0031989182, 0.0030944172,
0.0047891942, 0.0028169862, 0.0215873442, 0.0020847562, 0.0037806512,
0.0217515262, 0.0090971742, 0.0122162562, 0.0011257962, 0.0130435652,
0.0055148042, 0.0083239932, 0.0268987952, 0.0021491662, 0.0080216542,
0.0066735982, 0.0053911702, 0.0185785902, 0.0137863282, 0.0008059812,
0.0012895362, 0.0024514472, 0.0002341382, 0.0016947642, 0.0002882062,
0.0023575092, 0.0008561602, 0.0015975512, 0.0001175692, 0.0001666122,
0.0003673192, 0.0010398722, 0.0017795592, 0.0004381232, 0.0010125462,
0.0005299672, 0.0031931172, 0.0025627332, 0.0027740412, 0.0030131672,
0.0013492282, 0.0016463272, 0.0011142532, 0.0012079132, 0.0028049802,
0.0003664502
), .Dim = c(27L, 13L), .Dimnames = list(
c(
"2", "0",
"0", "0", "1", "0", "0", "1", "1", "1", "2", "0", "0", "1", "2",
"2", "1", "2", "2", "2", "2", "1", "1", "2", "2", "0", "0"
),
c(
"Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6", "Gene7",
"Gene8", "Gene9", "Gene10", "Gene11", "Gene12", "Gene13"
)
))
# At the moment the data are stored as a matrix, i.e.
class(my_data)
#> [1] "matrix"
# but we need a data.frame, so that lapply() visits one column (gene) at a
# time instead of one matrix cell at a time
my_data <- as.data.frame(my_data)
# Fit a separate Box-Cox transformation to every column (gene) and keep only
# the transformed values, taken from the `x.t` element of each fitted object
my_data_boxcox <- lapply(
  my_data,
  function(gene_values) boxcox(gene_values)$x.t
)
# Bind the per-gene vectors side by side and store them as a data.frame
result <- as.data.frame(
  do.call(cbind, my_data_boxcox)
)
The code above is the important part; the following is just a graphical check.
# Draw one histogram per gene (10 bins, free axes per facet) for a data.frame
# whose columns are genes. Facets follow natural gene order (Gene1, Gene2,
# ..., Gene10, ...) instead of plain alphabetical order.
plot_gene_histograms <- function(expression_data) {
  long_data <- pivot_longer(expression_data, everything())
  gene_levels <- str_sort(unique(long_data$name), numeric = TRUE)
  long_data <- mutate(long_data, name = factor(name, levels = gene_levels))
  ggplot(long_data) +
    geom_histogram(aes(x = value), bins = 10) +
    facet_wrap(vars(name), scales = "free")
}
# plot original data
plot_gene_histograms(my_data)
# plot after boxcox trans
plot_gene_histograms(result)
Created on 2020-03-08 by the reprex package (v0.3.0)
来源:https://stackoverflow.com/questions/60591514/trouble-shooting-box-cox-transformation-in-r-need-to-use-for-loop-or-apply