Data cleaning of dollar values and percentage in R

I've been searching for a number of packages in R to help me in converting dollar values to nice numerical values. I don't seem to be able to find one (in plyr package for example). The basic thing I'm looking for is simply removing the $ sign as well as translating "M" and "K" for Millions and thousands respectively.

To replicate, I can use this code below:

require(XML)
theurl <- "http://www.kickstarter.com/help/stats"
html <- htmlParse(theurl)

allProjects <- readHTMLTable(html)[[1]]
names(allProjects) <-  c("Category","LaunchedProjects","TotalDollars","SuccessfulDollars","UnsuccessfulDollars","LiveDollars","LiveProjects","SuccessRate")

The data looks like this:

> tail(allProjects)
      Category LaunchedProjects TotalDollars SuccessfulDollars UnsuccessfulDollars LiveDollars
8         Food            3,069     $16.79 M          $13.18 M             $2.78 M   $822.64 K
9      Theater            4,155     $13.45 M          $12.01 M             $1.22 M   $217.86 K
10      Comics            2,242     $12.88 M          $11.07 M           $941.31 K   $862.18 K
11     Fashion            2,799      $9.62 M           $7.59 M             $1.44 M   $585.98 K
12 Photography            2,794      $6.76 M           $5.48 M             $1.06 M   $220.75 K
13       Dance            1,185      $3.43 M           $3.13 M           $225.82 K     $71,322
   LiveProjects SuccessRate
8           189      39.27%
9           111      64.09%
10          134      46.11%
11          204      27.24%
12           83      36.81%
13           40      70.22%

I ended up writing my own function:

dollarToNumber <- function(vectorInput) {
  result <- c()
  for (dollarValue in vectorInput) {
    if (is.factor(dollarValue)) {  
      dollarValue = levels(dollarValue)
    }
    dollarValue <- gsub("(\\$|,)","",dollarValue)
    if(grepl(" K",dollarValue)) {
      dollarValue <- as.numeric(gsub(" K","",dollarValue)) * 1000
    } else if (grepl(" M",dollarValue)) {
      dollarValue <- as.numeric(gsub(" M","",dollarValue)) * 1000000
    }  
    if (!is.numeric(dollarValue)) {
      dollarValue <- as.numeric(dollarValue)
    }
    result <- append(result,dollarValue)
  }
    result
}

Then I used it to get what I wanted:

 allProjects <- transform(allProjects,
                          LaunchedProjects = as.numeric(gsub(",","",levels(LaunchedProjects))),
                          TotalDollars = dollarToNumber(TotalDollars),
                          SuccessfulDollars = dollarToNumber(SuccessfulDollars),
                          UnsuccessfulDollars = dollarToNumber(UnsuccessfulDollars),
                          LiveDollars = dollarToNumber(LiveDollars),
                          LiveProjects = as.numeric(LiveProjects),
                          SuccessRate = as.numeric(gsub("%","",SuccessRate))/100)

Which will give me this result below:

> str(allProjects)
'data.frame':   13 obs. of  8 variables:
 $ Category           : Factor w/ 13 levels "Art","Comics",..: 6 8 4 9 12 11 1 7 13 2 ...
 $ LaunchedProjects   : num  10006 1185 1860 20025 2242 ...
 $ TotalDollars       : num  1.11e+08 9.68e+07 6.89e+07 6.66e+07 4.31e+07 ...
 $ SuccessfulDollars  : num  90990000 84960000 59020000 59390000 34910000 ...
 $ UnsuccessfulDollars: num  16640000 7900000 6830000 5480000 3700000 ...
 $ LiveDollars        : num  3090000 3970000 3010000 1750000 4470000 ...
 $ LiveProjects       : num  13 7 6 11 3 10 8 4 1 2 ...
 $ SuccessRate        : num  0.394 0.338 0.382 0.541 0.334 ...

I'm new to R and I felt the code I've written is so ugly, surely there's a better way to do this without reinventing the wheel? I've used apply, aaply, ddply functions with no success (I was trying not to use the for loop as well...). On top of that, when dealing with the SuccessRate column, I couldn't find something like an as.percentage function in R. What am I missing?

Any guidance will be much appreciated!

A solution that uses parse and eval:

ToNumber <- function(X)
{
  A <- gsub("%","*1e-2",gsub("K","*1e+3",gsub("M","*1e+6",gsub("\\$|,","",as.character(X)),fixed=TRUE),fixed=TRUE),fixed=TRUE)
  B <- try(sapply(A,function(a){eval(parse(text=a))}),silent=TRUE)
  if (is.numeric(B)) return (as.numeric(B)) else return(X)
}

#----------------------------------------------------------------------
# Example:
X <-
  read.table( header=TRUE,
              text = 
   'Category LaunchedProjects TotalDollars SuccessfulDollars UnsuccessfulDollars LiveDollars  LiveProjects SuccessRate
        Food            3,069    "$16.79 M"         "$13.18 M"            "$2.78 M"  "$822.64 K" 189      39.27%
     Theater            4,155    "$13.45 M"         "$12.01 M"            "$1.22 M"  "$217.86 K" 111      64.09%
      Comics            2,242    "$12.88 M"         "$11.07 M"          "$941.31 K"  "$862.18 K" 134      46.11%
     Fashion            2,799     "$9.62 M"          "$7.59 M"            "$1.44 M"  "$585.98 K" 204      27.24%
 Photography            2,794     "$6.76 M"          "$5.48 M"            "$1.06 M"  "$220.75 K"  83      36.81%
       Dance            1,185     "$3.43 M"          "$3.13 M"          "$225.82 K"    "$71,322"  40      70.22%' )

numX <- as.data.frame(lapply(as.list(X),ToNumber))

options(width=1000)
print(numX,row.names=FALSE)

#    Category LaunchedProjects TotalDollars SuccessfulDollars UnsuccessfulDollars LiveDollars LiveProjects SuccessRate
#        Food             3069     16790000          13180000             2780000      822640          189      0.3927
#     Theater             4155     13450000          12010000             1220000      217860          111      0.6409
#      Comics             2242     12880000          11070000              941310      862180          134      0.4611
#     Fashion             2799      9620000           7590000             1440000      585980          204      0.2724
# Photography             2794      6760000           5480000             1060000      220750           83      0.3681
#       Dance             1185      3430000           3130000              225820       71322           40      0.7022

One thing that makes R different from other languages you might be used to is that it's better to do things in a "vectorized" way, to operate on a whole vector at a time rather than looping through each individual value. So your dollarToNumber function can be rewritten without the for loop:

dollarToNumber_vectorised <- function(vector) {
  # Want the vector as character rather than factor while
  # we're doing text processing operations
  vector <- as.character(vector)
  vector <- gsub("(\\$|,)","", vector)
  # Create a numeric vector to store the results in, this will give you
  # warning messages about NA values being introduced because the " K" values
  # can't be converted directly to numeric
  result <- as.numeric(vector)
  # Find all the "$N K" values, and modify the result at those positions
  k_positions <- grep(" K", vector)
  result[k_positions] <- as.numeric(gsub(" K","", vector[k_positions])) * 1000
  # Same for the "$ M" value
  m_positions <- grep(" M", vector)
  result[m_positions] <- as.numeric(gsub(" M","", vector[m_positions])) * 1000000
  return(result)
}

It still gives the same output as your original function:

> dollarToNumber_vectorised(allProjects$LiveDollars)
 [1] 3100000 3970000 3020000 1760000 4510000  762650  510860  823370  218590  865940
[11]  587670  221110   71934
# Don't worry too much about this warning
Warning message:
In dollarToNumber_vectorised(allProjects$LiveDollars) :
  NAs introduced by coercion
> dollarToNumber(allProjects$LiveDollars)
 [1] 3100000 3970000 3020000 1760000 4510000  762650  510860  823370  218590  865940
[11]  587670  221110   71934

来源：https://stackoverflow.com/questions/15014333/data-cleaning-of-dollar-values-and-percentage-in-r

标签

data-cleaning