Troubleshooting the Box-Cox transformation in R (need to use a for loop or apply)

前端 未结 1 1226
野的像风
野的像风 2021-01-29 04:41

Please find my data below (rows are disease groups: 0 = control, 1 = ulcerative colitis, 2 = Crohn's disease; columns are gene expression values).

     structure(c(5.54312e         


        
1条回答
  •  伪装坚强ぢ
    2021-01-29 05:37

    I would solve your problem as follows.

    # packages
    library(bestNormalize)
    library(tidyr)
    library(ggplot2)
    library(dplyr)
    library(stringr)
    
    # data
    # Example dataset from the question, stored as a 27 x 13 numeric matrix:
    #   - rows:    one per sample, named by disease group
    #              ("0" = control, "1" = ulcerative colitis, "2" = Crohn's disease)
    #   - columns: gene expression values for Gene1 ... Gene13
    # The numbers are listed column by column (R fills a matrix in column-major
    # order), so the first 27 values are Gene1, the next 27 are Gene2, and so on.
    # All listed values are positive.
    my_data <- structure(c(
      5.54312e-05, 5.6112e-06, 9.74312e-05, 1.3612e-06,
      1.29312e-05, 7.2512e-06, 0.0002159302, 3.6312e-06, 0.0001467552,
      1.53312e-05, 0.0009132182, 1.9312e-06, 0.0074214952, 0.0006480372,
      5.1312e-06, 6.1812e-06, 4.7612e-06, 0.0001199302, 0.0008845182,
      0.0008506632, 0.0002366382, 7.3912e-06, 8.5112e-06, 2.63312e-05,
      0.0013685242, 1.12312e-05, 0.0001775992, 0.0063385632, 0.0061628972,
      0.0406951632, 0.0132550862, 0.0330866502, 0.0741588422, 0.0049675282,
      0.0124742612, 0.0432014482, 0.0114703162, 0.0384477822, 0.0188251552,
      0.0277018382, 0.0633737932, 0.0053745442, 0.0488762832, 0.0099598792,
      0.0044341092, 0.0041768872, 0.0152889442, 0.0602260842, 0.0512892512,
      0.0065575852, 0.0174603572, 0.0076848152, 0.0021076082, 0.0057732232,
      0.0761864242, 0.0376310742, 0.0521594242, 0.0121793962, 0.0471997972,
      0.0224588692, 0.0302616442, 0.0062663212, 0.0286649272, 0.0228584812,
      0.0280185812, 0.0176817072, 0.0405636232, 0.0297912062, 0.0347780872,
      0.0193185042, 0.0118479432, 0.0096142082, 0.0640275732, 0.0353341802,
      0.0416389862, 0.0560150452, 0.0330486812, 0.0176602362, 0.0301871972,
      0.0579195622, 0.0299905202, 0.0001129152, 0.0009209172, 0.0010817792,
      0.0001951902, 0.0016784762, 0.0001716432, 0.0001917332, 0.0005600662,
      0.0003840872, 0.0004548142, 0.0007234162, 0.0002039282, 0.0009733682,
      0.0008222022, 0.0006205572, 0.0002608002, 0.0002146382, 0.0020774742,
      0.0006584612, 0.0004037032, 0.0003786822, 0.0004093372, 0.0017226182,
      0.0002138162, 0.0001766742, 0.0020229092, 0.0018869602, 0.0530292672,
      0.0225949962, 0.0119676672, 0.0268511442, 0.0377380112, 0.0313562992,
      0.1041032912, 0.0632652472, 0.0180284852, 0.1160380322, 0.0057282012,
      0.0536359992, 0.0591269722, 0.0118352722, 0.0046396552, 0.0143029422,
      0.0829488842, 0.0152022692, 0.0212954622, 0.0433420312, 0.0081537062,
      0.0156137782, 0.0432896402, 0.0488343522, 0.0191447942, 0.0598099022,
      0.0069907162, 0.0408296912, 0.0298613812, 0.0614052022, 0.0061426502,
      0.0097676332, 0.0354280242, 0.0372933212, 0.0130974212, 0.0022172112,
      0.0402114242, 0.0063038722, 0.0301466432, 0.0320339102, 0.0245904292,
      0.0779917522, 0.0172156972, 0.0147311782, 0.0480258512, 0.0316871712,
      0.0324477412, 0.0322786442, 0.0173019162, 0.0134506982, 0.0402077862,
      0.0426696462, 0.0345675212, 0.0346313502, 1.93312e-05, 4.7512e-06,
      5.41312e-05, 3.12e-08, 1.91312e-05, 9.642e-07, 6.0112e-06, 3.12e-08,
      1.0812e-06, 4.412e-07, 7.72312e-05, 4.382e-07, 0.0005851852,
      0.0002470232, 7.8912e-06, 3.1612e-06, 2.1712e-06, 7.5912e-06,
      9.29312e-05, 0.0001160552, 5.51312e-05, 7.8212e-06, 6.6812e-06,
      2.0912e-06, 0.0001043732, 4.1912e-06, 1.27312e-05, 0.0001975332,
      0.0001513812, 0.0001073372, 6.54312e-05, 0.0002255952, 0.0001426622,
      0.0001689042, 3.50312e-05, 0.0003652732, 0.0001742852, 0.0003393582,
      8.70312e-05, 0.0001367102, 0.0001566652, 0.0002242122, 0.0002053362,
      8.87312e-05, 0.0003058052, 0.0001336462, 0.0001512112, 0.0001072602,
      0.0001626102, 0.0001522802, 6.88312e-05, 0.0001138952, 0.0002492892,
      0.0002425912, 0.0007929912, 0.0076409822, 0.0049373582, 0.0004223922,
      0.0009535442, 0.0009512182, 0.0006713372, 0.0011064372, 0.0026065992,
      0.0030068982, 0.0019116772, 0.0013541412, 0.0124617692, 0.0004349482,
      0.0023764912, 0.0078575922, 0.0004369202, 0.0004881912, 0.0003481772,
      0.0009314802, 0.0003240052, 0.0049453522, 0.0006938762, 0.0004796032,
      0.0008434462, 0.0014197062, 0.0015475092, 8.16312e-05, 6.63312e-05,
      0.0001016142, 3.08312e-05, 0.0001470702, 5.13312e-05, 0.0001095102,
      2.39312e-05, 0.0002255062, 4.28312e-05, 0.0002308162, 2.10312e-05,
      0.0001356312, 0.0001242042, 0.0002451592, 0.0002754772, 3.18312e-05,
      0.0001751912, 0.0001802232, 0.0002467002, 0.0003787392, 4.35312e-05,
      0.0002678552, 7.20312e-05, 7.65312e-05, 8.79312e-05, 0.0001300572,
      0.0001114932, 3.17312e-05, 0.0002001272, 3.1512e-06, 8.75312e-05,
      3.1412e-06, 6.9212e-06, 0.0001659672, 5.98312e-05, 0.0002013862,
      5.9512e-06, 2.57312e-05, 2.53312e-05, 3.27312e-05, 0.0001374772,
      0.0001344332, 6.172e-07, 3.90312e-05, 0.0188869402, 0.0503434972,
      4.15312e-05, 1.67312e-05, 0.0001726452, 4.95312e-05, 1.27312e-05,
      9.85312e-05, 4.28312e-05, 0.0027084332, 0.0032156172, 0.0045711912,
      0.0017135802, 0.0243532152, 0.0066607792, 0.0031989182, 0.0030944172,
      0.0047891942, 0.0028169862, 0.0215873442, 0.0020847562, 0.0037806512,
      0.0217515262, 0.0090971742, 0.0122162562, 0.0011257962, 0.0130435652,
      0.0055148042, 0.0083239932, 0.0268987952, 0.0021491662, 0.0080216542,
      0.0066735982, 0.0053911702, 0.0185785902, 0.0137863282, 0.0008059812,
      0.0012895362, 0.0024514472, 0.0002341382, 0.0016947642, 0.0002882062,
      0.0023575092, 0.0008561602, 0.0015975512, 0.0001175692, 0.0001666122,
      0.0003673192, 0.0010398722, 0.0017795592, 0.0004381232, 0.0010125462,
      0.0005299672, 0.0031931172, 0.0025627332, 0.0027740412, 0.0030131672,
      0.0013492282, 0.0016463272, 0.0011142532, 0.0012079132, 0.0028049802,
      0.0003664502
    ), .Dim = c(27L, 13L), .Dimnames = list(
      # row names: disease group of each of the 27 samples (labels repeat)
      c(
        "2", "0",
        "0", "0", "1", "0", "0", "1", "1", "1", "2", "0", "0", "1", "2",
        "2", "1", "2", "2", "2", "2", "1", "1", "2", "2", "0", "0"
      ),
      # column names: the 13 genes
      c(
        "Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6", "Gene7",
        "Gene8", "Gene9", "Gene10", "Gene11", "Gene12", "Gene13"
      )
    ))
    
    # At the moment we are working with a matrix, i.e.
    # (is.matrix() is used instead of class() because class() of a matrix is
    # c("matrix", "array") from R 4.0 on, so its printed output is version
    # dependent)
    is.matrix(my_data)
    #> [1] TRUE
    
    # The row names carry the disease group of each sample and therefore repeat.
    # A data.frame cannot hold duplicated row names (they would be altered to
    # make them unique, or rejected), so keep the labels in their own vector,
    # aligned with the rows, and drop them from the matrix before converting.
    disease_group <- rownames(my_data)
    rownames(my_data) <- NULL
    
    # but we need a data.frame (a list of columns), so that the steps below can
    # work column by column
    my_data <- as.data.frame(my_data)
    
    # Apply the Box-Cox transformation to every column and keep only the
    # transformed values (`x.t`) of each fit.
    # - The call is namespace-qualified because MASS also exports a `boxcox()`
    #   with a different interface (it expects a model or formula); whichever
    #   package is attached last would otherwise win.
    # - NOTE: with its default `standardize = TRUE`, bestNormalize::boxcox()
    #   also centres and scales the transformed values; pass
    #   `standardize = FALSE` if only the power transformation is wanted.
    my_data_boxcox <- lapply(my_data, function(x) {
      bestNormalize::boxcox(x)$x.t
    })
    
    # and format as a data.frame: the list is named by gene and every element
    # has one value per sample, so it converts directly, one column per gene
    result <- as.data.frame(my_data_boxcox)
    

    The code above is the essential part; what follows is only a graphical check of the result.

    # Histograms of the untransformed values, one panel per gene
    my_data %>%
      pivot_longer(cols = everything(), names_to = "name", values_to = "value") %>%
      mutate(
        # natural sort, so the panels run Gene1, Gene2, ..., Gene13
        name = factor(name, levels = str_sort(unique(name), numeric = TRUE))
      ) %>%
      ggplot(mapping = aes(x = value)) +
      geom_histogram(bins = 10) +
      facet_wrap(~name, scales = "free")
    

    # Histograms of the Box-Cox transformed values, one panel per gene
    result %>%
      pivot_longer(cols = everything(), names_to = "name", values_to = "value") %>%
      mutate(
        # natural sort, so the panels run Gene1, Gene2, ..., Gene13
        name = factor(name, levels = str_sort(unique(name), numeric = TRUE))
      ) %>%
      ggplot(mapping = aes(x = value)) +
      geom_histogram(bins = 10) +
      facet_wrap(~name, scales = "free")
    

    Created on 2020-03-08 by the reprex package (v0.3.0)

    0 讨论(0)
提交回复
热议问题