Convert written number to number in R

后端 未结 2 586
小鲜肉
小鲜肉 2020-11-29 07:23

Does anybody know a function to convert a text representation of a number into an actual number, e.g. 'twenty thousand three hundred and five' into 20305? I have written n…

相关标签:
2条回答
  • 2020-11-29 08:02

    Here's a start that should get you to hundreds of thousands.

    word2num <- function(word){
        # Convert an English number phrase (up to the thousands scale) to a number.
        #
        # word: a single character string such as "forty six thousand four
        #       hundred fifty seven". Case is ignored, hyphens are treated as
        #       spaces and the filler word "and" is dropped.
        # Returns list(word, value): the untouched input and the numeric result.
        #
        # A value word directly followed by a teens/tens word is read as a count
        # of hundreds, so the shorthand "four fifty seven" yields 457.
        # Words outside the vocabulary are ignored with a warning; previously
        # they either raised "object 'temp' not found" or silently re-added the
        # value of the preceding word.
        one_digits <- c(zero=0, one=1, two=2, three=3, four=4, five=5,
                        six=6, seven=7, eight=8, nine=9)
        teens <- c(eleven=11, twelve=12, thirteen=13, fourteen=14, fifteen=15,
                   sixteen=16, seventeen=17, eighteen=18, nineteen=19)
        ten_digits <- c(ten=10, twenty=20, thirty=30, forty=40, fifty=50,
                        sixty=60, seventy=70, eighty=80, ninety=90)
        doubles <- c(teens, ten_digits)
        values <- c(one_digits, doubles)
        scales <- c("hundred", "thousand")

        wsplit <- strsplit(gsub("-", " ", tolower(word), fixed=TRUE),
                           " ", fixed=TRUE)[[1]]
        wsplit <- wsplit[!(wsplit %in% c("", "and"))]

        known <- wsplit %in% c(names(values), scales)
        if(!all(known)){
            warning("ignoring unrecognized word(s): ",
                    paste(unique(wsplit[!known]), collapse=", "),
                    call.=FALSE)
            wsplit <- wsplit[known]
        }

        total <- 0    # completed multiples of a thousand
        current <- 0  # amount below one thousand still being assembled
        for(i in seq_along(wsplit)){
            w <- wsplit[i]
            if(w == "hundred"){
                # A bare "hundred" counts as one hundred.
                current <- max(current, 1) * 100
            }
            else if(w == "thousand"){
                # Close the group: everything gathered so far is a count of
                # thousands ("four hundred thousand" -> 400 * 1000).
                total <- total + max(current, 1) * 1000
                current <- 0
            }
            else{
                temp <- values[[w]]
                # Shorthand: "four fifty" means four hundred fifty.
                if(i < length(wsplit) && wsplit[i+1] %in% names(doubles))
                    temp <- temp * 100
                current <- current + temp
            }
        }
        return(list(word, total + current))
    }
    

    Results:

    > word2num("fifty seven")
    [[1]]
    [1] "fifty seven"
    
    [[2]]
    [1] 57
    
    > word2num("four fifty seven")
    [[1]]
    [1] "four fifty seven"
    
    [[2]]
    [1] 457
    
    > word2num("six thousand four fifty seven")
    [[1]]
    [1] "six thousand four fifty seven"
    
    [[2]]
    [1] 6457
    
    > word2num("forty six thousand four fifty seven")
    [[1]]
    [1] "forty six thousand four fifty seven"
    
    [[2]]
    [1] 46457
    
    > word2num("forty six thousand four hundred fifty seven")
    [[1]]
    [1] "forty six thousand four hundred fifty seven"
    
    [[2]]
    [1] 46457
    
    > word2num("three forty six thousand four hundred fifty seven")
    [[1]]
    [1] "three forty six thousand four hundred fifty seven"
    
    [[2]]
    [1] 346457
    

    I can tell you already that this won't work for word2num("four hundred thousand fifty"), because it doesn't know how to handle consecutive "hundred" and "thousand" terms, but the algorithm can probably be modified to cope with that. Anyone should feel free to edit this if they have improvements, or to build on it in their own answer. I just thought this was a fun problem to play with (for a little while).

    Edit: Apparently Bill Venables has a package called english that may achieve this even better than the above code.

    0 讨论(0)
  • 2020-11-29 08:09

    Here's what I think is a better solution.

        library(stringdist)
        library(gdata)
        #Convert numeric words to digits
    isNumericWord <- function(string, dist=1, method="dl"){
      # Report whether `string` is within `dist` edits of an English number word.
      #
      # string: the word to test; it is lower-cased before comparison.
      # dist:   largest string distance still accepted as a match.
      # method: distance method handed to stringdist().
      # Returns TRUE when at least one vocabulary word is close enough.
      units  <- c("zero","one","two","three","four","five","six","seven",
                  "eight","nine")
      teens  <- c("ten","eleven","twelve","thirteen","fourteen","fifteen",
                  "sixteen","seventeen","eighteen","nineteen")
      tens   <- c("twenty","thirty","forty","fifty","sixty","seventy",
                  "eighty","ninety")
      scales <- c("hundred","thousand","million","billion","trillion")
      vocabulary <- c(units, teens, tens, scales)

      distances <- stringdist(tolower(string), vocabulary, method=method)
      any(distances <= dist)
    }
    numberTypes <- function(string, dist=1, method="dl"){
      # Rewrite ordinal words as "<cardinal> <suffix>" pairs, e.g. "first"
      # becomes "one st" and "hundredth" becomes "hundred th".
      #
      # string: text to process; punctuation is replaced by spaces first.
      # dist:   largest string distance still accepted as a match.
      # method: distance method handed to stringdist().
      # Returns the rewritten text as one space-separated string, or "" when
      # the input contains no words.
      vocabulary <- c("zero","one","two","three","four","five","six","seven","eight","nine",
                      "ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen",
                      "twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety",
                      "hundred","thousand","million","billion","trillion")

      # Ordinal word -> replacement, applied strictly in this order because
      # each step works on the output of the previous one.
      ordinals <- c(first="one st", second="two nd", third="three rd",
                    fourth="four th", fifth="five th", sixth="six th",
                    seventh="seven th", eighth="eight th", ninth="nine th",
                    tenth="ten th", twentieth="twenty th",
                    thirtieth="thirty th", fortieth="forty th",
                    fiftieth="fifty th", sixtieth="sixty th",
                    seventieth="seventy th", eightieth="eighty th",
                    ninetieth="ninety th")
      # These ordinals are not applied to words whose last letter is "y".
      guarded <- c("third","fourth","fifth","sixth","seventh","eighth","ninth")

      tokens <- strsplit(gsub("[[:punct:]]", " ", string), split=" ")[[1]]
      tokens <- tokens[tokens != ""]

      for(ordinal in names(ordinals)){
        hit <- stringdist(ordinal, tolower(tokens), method=method) <= dist
        if(ordinal %in% guarded){
          last_char <- tolower(substr(tokens, nchar(tokens), nchar(tokens)))
          hit <- hit & last_char != "y"
        }
        tokens <- ifelse(hit, ordinals[[ordinal]], tokens)
      }

      if(length(tokens) == 0){
        return("")
      }

      # Split a trailing "th" off any remaining word whose stem is a number word.
      for(k in seq_along(tokens)){
        token <- tokens[k]
        len <- nchar(token)
        suffix <- substr(token, len - 1, len)
        stem <- substr(token, 1, len - 2)
        near_number <- any(stringdist(tolower(stem), vocabulary, method=method) <= dist)
        if(suffix == "th" & len != 2 & near_number){
          tokens[k] <- paste(stem, suffix, sep=" ")
        }
      }

      # Stems taken from already-split tokens keep their space, so collapse
      # the resulting double spaces.
      gsub("  ", " ", paste(tokens, collapse=" "))
    }
    
    #Convert number words to digits
    Word2Num=function(string, dist=1, method="dl"){
      # Replace the number words in a piece of text with digits.
      #
      # string: a single character string, e.g. "I have twenty one apples".
      # dist:   largest string distance at which a word still counts as a
      #         (possibly misspelled) number word; it is forwarded to
      #         numberTypes() and isNumericWord() as well.
      # method: distance method handed to stringdist().
      # Returns the text with every run of consecutive number words replaced
      # by digits and a following ordinal suffix glued on ("twenty first" ->
      # "21st"). Whenever a replacement happens the result is lower-cased and
      # stripped of punctuation; if no number word is found the input is
      # returned unchanged.
      #
      # NOTE(review): with the default dist=1 short ordinary words such as
      # "to" or "for" are within one edit of "two"/"four" and are converted
      # too; pass dist=0 for exact matching.
      original=string
      #Define numbers
      one_digits = list(zero=0, one=1, two=2, three=3, four=4, five=5,
                        six=6, seven=7, eight=8, nine=9)
      teens = list(eleven=11, twelve=12, thirteen=13, fourteen=14, fifteen=15,
                   sixteen=16, seventeen=17, eighteen=18, nineteen=19)
      ten_digits = list(ten=10, twenty=20, thirty=30, forty=40, fifty=50,
                        sixty=60, seventy=70, eighty=80, ninety=90)
      large_digits = list(hundred=100, thousand=1000, million=1e6, billion=1e9, trillion=1e12)
      double_digits = c(teens,ten_digits)
      ordinal_suffixes = c("st","nd","rd","th")

      #Value of the closest entry of `table` within `dist` of `word`, else NA
      lookupValue=function(word, table){
        d=stringdist(word,names(table),method=method)
        candidates=which(!is.na(d) & d<=dist)
        if(length(candidates)==0){
          return(NA_real_)
        }
        table[[candidates[which.min(d[candidates])]]]
      }

      #Numeric value of one lower-case word, or NA if it is not a number word
      wordValue=function(word){
        if(!isTRUE(isNumericWord(word,dist=dist,method=method))){
          return(NA_real_)
        }
        value=NA_real_
        #Words ending in "y" are never single digits ("eighty" is not "eight")
        if(substr(word,nchar(word),nchar(word))!="y"){
          value=lookupValue(word,one_digits)
        }
        if(is.na(value)){
          value=lookupValue(word,double_digits)
        }
        if(is.na(value)){
          value=lookupValue(word,large_digits)
        }
        value
      }

      #Fold the values of consecutive number words into one or more numbers.
      #A new number is started whenever the next word cannot extend the
      #current one ("one two three" -> 1 2 3, "nineteen eighty" -> 19 80).
      combineValues=function(vals){
        results=numeric(0)
        total=0        #completed multiples of thousand/million/...
        current=0      #amount below one thousand still being assembled
        started=FALSE  #has the number being built consumed any word?
        segment=FALSE  #has any word been consumed since the last large scale?
        for(v in vals){
          if(v>=1000){
            if(segment){
              total=total+current*v
            }else if(started){
              #scale directly after a scale, e.g. "two thousand million"
              total=total*v
            }else{
              total=v
            }
            current=0
            segment=FALSE
            started=TRUE
            next
          }
          if(v==100){
            fits=!segment || (current>0 && current<100)
          }else if(v>=10){
            fits=!segment || (current>0 && current%%100==0)
          }else{
            rem=current%%100
            fits=!segment || (current>0 && rem%%10==0 && (rem==0 || rem>=20))
          }
          if(!fits){
            results=c(results,total+current)
            total=0
            current=0
            segment=FALSE
          }
          if(v==100){
            current=if(segment) current*100 else 100
          }else{
            current=current+v
          }
          segment=TRUE
          started=TRUE
        }
        c(results,total+current)
      }

      #Split the string into words
      string=gsub("-"," ",gsub(" & ", " and ",string,ignore.case=TRUE))
      string=numberTypes(string,dist=dist,method=method)
      wrdsplit=strsplit(tolower(string)," ")[[1]]
      wrdsplit=wrdsplit[wrdsplit!=""]
      values=vapply(wrdsplit,wordValue,numeric(1),USE.NAMES=FALSE)
      isNumber=!is.na(values)
      if(!any(isNumber)){
        return(original)
      }

      #Walk the words, replacing each run of number words by its digits
      n=length(wrdsplit)
      pieces=vector("list",n)
      i=1
      while(i<=n){
        if(!isNumber[i]){
          pieces[[i]]=wrdsplit[i]
          i=i+1
          next
        }
        j=i
        while(j<n && isNumber[j+1]){
          j=j+1
        }
        #formatC avoids scientific notation without touching options(scipen)
        digits=formatC(combineValues(values[i:j]),format="f",digits=0)
        #Combine the number with its ordinal ending, if one follows
        if(j<n && wrdsplit[j+1] %in% ordinal_suffixes){
          digits[length(digits)]=paste0(digits[length(digits)],wrdsplit[j+1])
          j=j+1
        }
        pieces[[i]]=digits
        i=j+1
      }
      paste(unlist(pieces),collapse=" ")
    }
    
    0 讨论(0)
提交回复
热议问题