问题
I have searched and found many solutions that came close, but never quite worked in the end. This is probably something very simple, for those with experience...
Here is a snippet of my data. This was created automatically from a JSON import by the package jsonlite. The data is very nicely structured, but I am nevertheless helpless. Update2: I have added the relevant data below
structure(list(rightsize = c(42L, 50L, 52L, 49L, 41L, 41L, 41L,
41L, 41L, 45L, 47L, 42L, 45L, 46L, 42L, 44L, 44L, 37L, 44L, 41L
), hitlen = c("", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", ""), linegroup = c("_", "_", "_",
"_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_",
"_", "_", "_", "_"), leftsize = c(46L, 43L, 43L, 37L, 49L, 43L,
43L, 45L, 45L, 43L, 44L, 46L, 45L, 46L, 44L, 43L, 54L, 45L, 51L,
47L), leftspace = c(" ", " ", " ",
" ", " ", " ", " ", " ",
" ", " ", " ", " ", " ",
" ", " ", " ", "", " ", " ",
" "), Left = list(structure(list(class = c("", "coll",
""), str = c("patients with ", "chronic", " obstructive pulmonary"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("respect to ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("While there is no cure for this ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "strc", "", "coll", ""), str = c(".",
"</p><p>", "When patients with ", "chronic", " liver")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
class = c("", "coll", ""), str = c("bronchitis , and ", "chronic",
" obstructive pulmonary")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("offers the possibility that ",
"chronic", " lung")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" , such as ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("always as clear in other ",
"chronic", " incurable")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("may have the potential to prevent ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" half the estimated cost of all ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("is consistent with the tact that ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("used to treat ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("ingredient for dietary therapy of ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("patients with ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("greater for ", "chronic",
" obstructive pulmonary")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" departments , with schemes for ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("postponement of death by means of managing ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("certainly be ",
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c("cardiovascular disease , cancer , other ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c("terminal illnesses are converted to ",
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L))), Right = list(structure(list(class = "", str = " who may be at risk of developing steroid"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " - plausibly related to exposure to environmental"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " , it can be treated , Black says . Antidepressants"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " ask what they can do to improve their condition"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " [ COPD ] ) was 15 % ( estimated within "), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " is part of the continuum of development"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " ( 70 , 71 ) and sleep apnea . Elevation"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " . Patients with heart failure highlight"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " other than heart disease , and helps us"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " in this country . Furthermore , the portion"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " are multigenic and multifactorial . Therefore"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " . Nasal corticosteroids are increasingly"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " such as diabetes mellitus or hyperlipidemia"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " ( COPD ) concluded exercise relieves dyspnea"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " than for any other disease. 5 The number"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " management in patients with COPD receiving"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " and disability is costly , and it is bound"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "strc", ""), str = c(" .", "</p><p>", "Much rarer condition , but people"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = "", str = " , and in fact those rates have been rising"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " . The panel 's report is negative about"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), Kwic = list(structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " disease"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "diseases"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), toknum = c(580661252L,
585871494L, 572902309L, 596182644L, 611091300L, 604962106L, 605346237L,
585102838L, 575701411L, 616556239L, 548908661L, 604489309L, 548601059L,
617460845L, 585870185L, 591049175L, 581965276L, 592616458L, 592591831L,
599295354L), rightspace = c(" ", " ", "", " ", " ",
" ", " ", " ", " ", " ",
" ", " ", " ", " ", " ", " ",
" ", " ", " ", " "), Tbl_refs = list(
"11.99.0023.006", "11.99.0031.001", "11.99.0012.004", "11.99.0046.013",
"11.99.0069.003", "11.99.0059.007", "11.99.0060.003", "11.99.0030.001",
"11.99.0016.007", "11.99.0077.021", "11.01.0003.015", "11.99.0059.003",
"11.01.0003.006", "11.99.0078.034", "11.99.0031.001", "11.99.0038.005",
"11.99.0025.005", "11.99.0040.006", "11.99.0040.006", "11.99.0051.011"),
ref = c("11.99.0023.006", "11.99.0031.001", "11.99.0012.004",
"11.99.0046.013", "11.99.0069.003", "11.99.0059.007", "11.99.0060.003",
"11.99.0030.001", "11.99.0016.007", "11.99.0077.021", "11.01.0003.015",
"11.99.0059.003", "11.01.0003.006", "11.99.0078.034", "11.99.0031.001",
"11.99.0038.005", "11.99.0025.005", "11.99.0040.006", "11.99.0040.006",
"11.99.0051.011")), .Names = c("rightsize", "hitlen", "linegroup",
"leftsize", "leftspace", "Left", "Right", "Kwic", "toknum", "rightspace",
"Tbl_refs", "ref"), class = "data.frame", row.names = c(NA, 20L
))
What I need to do is 1) transpose these 4 dataframes and assign the values in "class" to be the column headers. Note, #1, the number of columns may differ. Also note (#2) that some of the column names will be "". As such, the wonderful solution here results in dataframes in which some column headings are all filled with junk, making the next step (dataframe merging) impossible, e.g.
- ""
- strc
- structure("When patients with ", class = "AsIs")
- coll
- structure(" liver", class = "AsIs").
(The junk-fill headers seem to be the ones that were "", beyond the first.)
Following that step, I would then need to merge these dataframes, whilst accounting for missing values. Rbind.fill does the trick, but only when the data is sufficiently uniform. I have searched high & low for a solution, and have yet to find one that sufficiently addresses this issue.
Update: I have continued to experiment with melt/cast. The following brings me very close to an acceptable final solution:
# Reshape the list of per-hit data frames in documentdata$Left so that each
# distinct value of `class` becomes its own column.
library(reshape2)  # library() stops immediately if reshape2 is not installed

# Long form: `class` is kept as the identifier, every other column is melted
# into variable/value pairs, and L1 records which list element a row came from.
docx <- melt(documentdata$Left, id.vars = "class")

# Wide form: one row per (L1, variable) pair; cells that share a class label
# within the same row are collected into a list.
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate = list)
The only problem is, as mentioned, the blank "class" causes the structure to be lost upon dcast: all of the unnamed columns wind up merged and out of order, e.g.
L1 variable Var.3 coll strc
1 1 str patients with , obstructive pulmonary chronic
2 2 str respect to , obstructive pulmonary chronic
3 3 str While there is no cure for this , chronic
4 4 str ., When patients with , liver chronic </p><p>
5 5 str bronchitis , and , obstructive pulmonary chronic
The key "class" in the original data is the variable "coll", which always has at least one blank before and one blank after. One solution might be to create names "pre-coll" and "post-coll" prior to dcast?
Update #3: here's one possible, albeit ugly solution. Any "cleaner" options?
# Give the blank-class cells directly before and after each "coll" row their
# own labels, so they end up in separate columns instead of being merged.
library(reshape2)  # library() stops immediately if reshape2 is not installed

# Long form: `class` is kept as the identifier and L1 records which list
# element a row came from.
docx <- melt(documentdata$Left, id.vars = "class")

# Locate the "coll" rows once and derive both neighbours from that single scan.
coll_rows <- which(docx$class == "coll")
pre <- coll_rows - 1
post <- coll_rows + 1
docx$class[pre] <- "l.pre"
docx$class[post] <- "l.post"

# Wide form: one row per (L1, variable) pair, one list-column per class label;
# then keep only the three columns of interest, in reading order.
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate = list)
docx.left <- docx[, c("l.pre", "coll", "l.post")]
Thanks in advance for the help.
回答1:
Let's do it with `dplyr`:
# Stack the per-hit data frames in documentdata$Left into a single data frame,
# then build a new data frame holding each "coll" string together with the
# string in the row directly before it (pre) and directly after it (post).
# Inside do(), `.` stands for the stacked data frame; which(...) yields the row
# positions of every "coll" entry, and the -1 / +1 offsets pick its neighbours.
library(dplyr)
documentdata$Left %>% do.call(rbind, .) %>%
do(data.frame(pre = .[["str"]][which(.[["class"]]=="coll")-1],
coll = .[["str"]][which(.[["class"]]=="coll")],
post = .[["str"]][which(.[["class"]]=="coll")+1]))
pre coll post
1 patients with chronic obstructive pulmonary
2 respect to chronic obstructive pulmonary
3 While there is no cure for this chronic
4 When patients with chronic liver
5 bronchitis , and chronic obstructive pulmonary
6 offers the possibility that chronic lung
....
18 certainly be chronic obstructive pulmonary
19 cardiovascular disease , cancer , other chronic
20 terminal illnesses are converted to chronic
EDIT: an explanation:
`dplyr` has a weird syntax. See the dplyr vignette or the data wrangling cheat sheet. The `%>%` is the pipe from the `magrittr` package and simply puts the output of everything on the left of the pipe as the first argument of the function to the right:
# The pipe passes the left-hand side as the first argument of the call on the right.
5 %>% c(1)
# same as
c(5, 1)
You can use the `.` to represent the stuff on the left if you want to use it somewhere else instead. You can subset the `.` if you like (e.g. the `.[["str"]]`):
# With an explicit `.`, the left-hand side is placed where the dot appears.
5 %>% c(1, .)
# same as
c(1, 5)
`do` allows us to do any computation we want, without worrying about the standard `dplyr` verbs - it's a wrapper. See `?do`.
So the answer takes the `documentdata$Left`, pipes it into `do.call(rbind, .)`, which collapses the list (so far this is the same as `do.call(rbind, documentdata$Left)`). Then we pipe that to the `do`, which makes a new data frame with the relevant columns selected from the `.`.
来源:https://stackoverflow.com/questions/32220184/in-r-transpose-and-combine-multiple-dataframes-with-missing-data-and-blank-colum