R: building text Classifier

拈花ヽ惹草 提交于 2019-12-10 11:47:37

问题


I have a set of content that has to be classified based on a few rules.

sample data:

    1     chin jeffrey hong kong  wednesday  october     global business reporting cc subramanian raghuveer   kumar m santhosh   antoo ramesh subject request  obtain global icis data dear team appreciate   can distribute   monthly basis  latest global icis data    ramesh antoo upon  availability regards jeffrey chin associate business risk strategy  efficiency brse  asia international  institutional banking australia  new zealand banking group limited f three exchange square  connaught place central hong kong phone    voice net   email jeffreychinanzcom brse   key business risk interface team within iib providing global support  strategic perspectives  policy procedures  reporting  includes risk appetite credit process quality assurance interlock  key stakeholders  well  iib support  key projects   project glue cacheorion working groups efficiency initiatives  business risk forums p please consider  environment  printing  email
    2      beren stuart vanuatu  monday  october     g s venkatesh ramesh sandeep talanki   h r nagaraj subject please approve  qlikview gpa access hi please process  following form  gpa access please email  requestor  line manager   access  granted raj can  please add  given  user  qlikview workgroup    gpa access form  requestors name lim tek kon vanuatu address lini high way port vila efate title  relationship manager emerging corporates employee id  lan id limtk bsbcc  authorising manager beren stuart vanuatu  yes   read  use  gpa dashboard business technical reason na  
    3     kumar m santhosh   behalf  relationshipbankingfinancesupport  friday  october     g s venkatesh cc global business reporting subject fw please approve  qlikview gpa access regards santhosh   faunt daniel png  wednesday  october     relationshipbankingfinancesupport cc amet sova subject fw please approve  qlikview gpa access hi   unable  approve  excel due  macro issues please process   amet sova  monday  october     faunt daniel png subject please approve  qlikview gpa access hello can  please review  attached form  click line manager approval  approve 
    4     thomson owen tonga  thursday  october     g s venkatesh ramesh sandeep talanki   h r nagaraj subject please approve  qlikview gpa access hi please process  following form  gpa access please email  requestor  line manager   access  granted raj can  please add  given  user  qlikview workgroup    gpa access form  requestors name hia viliami address head office fakafanua centre maufanga vuna road nukualofa tongatapu tonga nukualofa tongatapu title  nfc  amu manager employee id  lan id hiav bsbcc   authorising manager thomson owen tonga  yes   read  use  gpa dashboard business technical reason  
    5     kumar rajesh fiji  tuesday  october     g s venkatesh ramesh sandeep talanki   h r nagaraj subject please approve  qlikview gpa access hi please process  following form  gpa access please email  requestor  line manager   access  granted raj can  please add  given  user  qlikview workgroup    gpa access form  requestors

Each entry above is just one row value; in practice I have to go through anywhere from 500 to 10,000 such rows. Here I have extracted the words which I'll be using:

> O
$text
$text[[1]]
 [1] "qlikview" "gpa"      "access"   "gpa"      "access"   "access"   "qlikview" "gpa"      "access"  "gpa"     

$text[[2]]
 [1] "report"   "qlikview" "gpa"      "access"   "qlikview" "gpa"      "access"   "qlikview" "gpa"     
[10] "access"  

$text[[3]]
 [1] "qlikview" "gpa"      "access"   "gpa"      "access"   "access"   "qlikview" "gpa"      "access"  
[10] "gpa"     

$text[[4]]
 [1] "qlikview" "gpa"      "access"   "gpa"      "access"   "access"   "qlikview" "gpa"      "access"  
[10] "gpa"     

$text[[5]]
 [1] "report"   "qlikview" "gpa"      "access"   "access"   "gpa"      "access"   "qlikview" "gpa" "access"   "access"   "gpa"      "qlikview" "gpa"      "access"   "qlikview" "gpa"  "access"  

$text[[6]]
 [1] "report"   "qlikview" "access"   "access"   "report"   "qlikview" "access"   "access"   "gpa"     
[10] "qlikview" "access"   "access"   "qlikview" "access"   "access"  

$text[[7]]
 [1] "report" "report" "access" "access" "report" "report" "report" "report" "report" "report" "data"  "data"   "report" "access" "report" "report"

$text[[8]]
[1] "report"   "qlikview" "gpa"      "access"   "gpa"      "access"  

$text[[9]]
 [1] "report" "gpa"    "access" "access" "gpa"    "gpa"    "gpa"    "gpa"    "gpa"    "access" "gpa"   "gpa"    "gpa"    "report"

$text[[10]]
 [1] "report" "gpa"    "gpa"    "access" "gpa"    "access" "gpa"    "access" "gpa"    "gpa"    "report" "gpa"    "gpa"    "access" "gpa"    "gpa"    "gpa"    "gpa"    "gpa"  

Now I have to build rules on this using if conditions. How can a list be represented so that I can loop over each text and apply the following checks: if "access" is present, then check whether "gpa" or "qlikview" is present and return ACCESS for the row value (sample data); else if "report" is present, then check whether "pfi" or "Regional" is present and return REPORT?

I have converted the above list to a data frame that looks like this:

code:

# Length of the longest element of O.
maxl <- max(sapply(O,length))
# Index every element of O by 1:maxl and bind the results side by side as columns.
out <- do.call(cbind, lapply(O,function(x) x[1:maxl]))
# Turn the bound result into a data frame (its printed form is shown below).
out <- as.data.frame(out) 
    text
1   c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa", "access", "gpa")
2   c("report", "qlikview", "gpa", "access", "qlikview", "gpa", "access", "qlikview", "gpa", "access")
3   c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa", "access", "gpa")
4   c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa", "access", "gpa")
5   c("report", "qlikview", "gpa", "access", "access", "gpa", "access", "qlikview", "gpa", "access", "access", "gpa", "qlikview", "gpa", "access", "qlikview", "gpa", "access")
6   c("report", "qlikview", "access", "access", "report", "qlikview", "access", "access", "gpa", "qlikview", "access", "access", "qlikview", "access", "access")

How can I remove the list type c() in this data frame?

I tried some code:

 #Rule Classifier-----
# Attempted rule-based classifier: meant to return "Access" or "Report" for
# the keyword data in `out`. This version does not work as written:
#   * `for(i in out)` binds `i` to each element of `out` itself, not to an
#     integer position, so `out[i]` is indexed with a list and fails with the
#     "invalid subscript type 'list'" error reported further down.
#   * `x` is never defined inside the function (the parameter is `out`).
#   * Each inner `if` re-tests the same `x[j]` that was just found equal to
#     "Access" / "Report", so it can never also equal "gpa", "qlikview",
#     "pfi" or "data".
#   * The labels compared against are capitalised ("Access", "Report") while
#     the extracted words shown above are all lower case.
#   * `return()` sits inside the loops, so at most one label would be
#     returned for the whole input rather than one per row.
rule <- function(out)
{
  # Iterates over the elements of `out`, not over 1..n.
  for(i in out)
  {
    # `i` is not a valid subscript here; this is the line that errors.
    for(j in out[i])
    {
      if(x[j]=="Access")
      {
        if(x[j]=="gpa" | x[j]=="qlikview")
        {
          return("Access")
        }
      }
      else if(x[j]=="Report")
      {
        if(x[j]=="pfi" | x[j]=="data" )
        {
          return("Report")
        }
      }
    }
  }
}

Expected output:

1      Access
2       Access
3       Access
4       Access
5       Access
6       Access
7       Report/Data
8       Access
9       Access
10      Access
11     Report/Data
12     Report/Data
13     Report/Data
14     Report/Data

dput(O$text[1:10])

list(c("qlikview", "gpa", "access", "gpa", "access", "access", 
"qlikview", "gpa", "access", "gpa"), c("report", "qlikview", 
"gpa", "access", "qlikview", "gpa", "access", "qlikview", "gpa", 
"access"), c("qlikview", "gpa", "access", "gpa", "access", "access", 
"qlikview", "gpa", "access", "gpa"), c("qlikview", "gpa", "access", 
"gpa", "access", "access", "qlikview", "gpa", "access", "gpa"
), c("report", "qlikview", "gpa", "access", "access", "gpa", 
"access", "qlikview", "gpa", "access", "access", "gpa", "qlikview", 
"gpa", "access", "qlikview", "gpa", "access"), c("report", "qlikview", 
"access", "access", "report", "qlikview", "access", "access", 
"gpa", "qlikview", "access", "access", "qlikview", "access", 
"access"), c("report", "report", "access", "access", "report", 
"report", "report", "report", "report", "report", "data", "data", 
"report", "access", "report", "report"), c("report", "qlikview", 
"gpa", "access", "gpa", "access"), c("report", "gpa", "access", 
"access", "gpa", "gpa", "gpa", "gpa", "gpa", "access", "gpa", 
"gpa", "gpa", "report"), c("report", "gpa", "gpa", "access", 
"gpa", "access", "gpa", "access", "gpa", "gpa", "report", "gpa", 
"gpa", "access", "gpa", "gpa", "gpa", "gpa", "gpa"))
rule(out)
# This call fails with: Error in `[.default`(out, i) : invalid subscript type 'list'

I know this is quite naive, but I'm new to this; please correct me if I am going wrong somewhere.


回答1:


You seem to be assuming that a `for (i in out)` loop iterates with an integer index. A for loop simply iterates over the objects in the list, so `i` is itself a list when you try to use it as an index, which is what causes the "invalid subscript type 'list'" error. See ?lapply for a better way to work with lists.

# Ten keyword vectors, one per document row (the same values as the dput()
# output posted in the question).
text <- list(c("qlikview", "gpa", "access", "gpa", "access", "access",
            "qlikview", "gpa", "access", "gpa"), 
          c("report", "qlikview", 
            "gpa", "access", "qlikview", "gpa", "access", "qlikview", "gpa", 
            "access"), 
          c("qlikview", "gpa", "access", "gpa", "access", "access", 
            "qlikview", "gpa", "access", "gpa"), 
          c("qlikview", "gpa", "access", 
            "gpa", "access", "access", "qlikview", "gpa", "access", "gpa"), 
          c("report", "qlikview", "gpa", "access", "access", "gpa", 
            "access", "qlikview", "gpa", "access", "access", "gpa", "qlikview", 
            "gpa", "access", "qlikview", "gpa", "access"), 
          c("report", "qlikview", 
            "access", "access", "report", "qlikview", "access", "access", 
            "gpa", "qlikview", "access", "access", "qlikview", "access", 
            "access"), 
          c("report", "report", "access", "access", "report", 
            "report", "report", "report", "report", "report", "data", "data", 
            "report", "access", "report", "report"), 
          c("report", "qlikview", 
            "gpa", "access", "gpa", "access"), 
          c("report", "gpa", "access", 
            "access", "gpa", "gpa", "gpa", "gpa", "gpa", "access", "gpa", 
            "gpa", "gpa", "report"), 
          c("report", "gpa", "gpa", "access", 
            "gpa", "access", "gpa", "access", "gpa", "gpa", "report", "gpa", 
            "gpa", "access", "gpa", "gpa", "gpa", "gpa", "gpa")) 
# Wrap the list as a single column named `text` and convert it to a data
# frame; rule() below reads the vectors back through `out$text`.
O <- as.data.frame(cbind(text))

# Label each keyword vector in `out$text` with a category.
#
# out: a data frame (or list) whose `text` element is a list of character
#      vectors, one vector of extracted keywords per row.
#
# Returns a character vector with one label per element of `out$text`:
#   "Access"      - "access" occurs together with "gpa" or "qlikview"
#   "Report/Data" - otherwise, "report" occurs together with "pfi" or "data"
#   "NA"          - neither rule matched (the literal string, not a missing
#                   value)
rule <- function(out) {
  # Decide the label for a single row's keywords.
  label_for <- function(keywords) {
    found <- function(word) word %in% keywords
    if (found("access") && (found("gpa") || found("qlikview"))) {
      "Access"
    } else if (found("report") && (found("pfi") || found("data"))) {
      "Report/Data"
    } else {
      "NA"
    }
  }
  vapply(out$text, label_for, character(1))
}

# Classify every row of O; the printed result is reproduced in the comments below.
rule(O)
#  [1] "Access"      "Access"      "Access"      "Access"      "Access"      "Access"      "Report/Data" "Access"     
#  [9] "Access"      "Access"


来源:https://stackoverflow.com/questions/29669721/r-building-text-classifier

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!