Error in writing data frame in R

百般思念 提交于 2019-12-20 05:58:03

问题


I'm trying to search for a word in text that I extract from a PDF file which has been OCR'd. The PDF file has multiple pages, so I search each page for that word; if the word is found, I write the file name, the status (Present or Not Present), the page on which it was found, and the words that were found to a data frame. But the data frame gives the status "Present" for all files. I just want output like this:

file_name       Status        Page              words
test1.pdf    "Present"       test1_2,test1_4    gym,school
test2.pdf    "Not Present"     -                 -
test3.pdf    "Present"       test3_1            gym

What am I missing in this code?

here is the code

    # Search every PDF matched by "*.pdf" page by page: each page is rendered
    # to a TIFF image, OCR'd, and checked for `word` and for the entries of
    # `chk_words`. One result is appended to `df` per file.
    All_files=Sys.glob("*.pdf")
# v1[i] records the page number on which `word` was first found in file i;
# it stays 0 when the word is never found.
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
# tc, ps and x collect the matched check-words. They are initialised once here
# and never reset inside the loops, so their contents carry over between pages
# and between files.
tc=c()
ps=c()
x=list()
df <- data.frame()
# Initial value only; Status is reassigned for every file further down.
Status="Present"

for (i in seq_along(All_files)){


  file_name <- All_files[i]

  # Number of pages in the current PDF.
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)

  for(j in seq_len(cnt)){
    # Render only page j to a TIFF image at 400 dpi and run OCR on it.
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    # capture.output(cat(text)) yields the OCR text as a vector of lines.
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    # Clean up the generated TIFF files. dir() is called without full.names,
    # so it returns bare file names that file.remove() resolves against the
    # working directory.
    # NOTE(review): presumably the working directory is this same folder;
    # otherwise nothing is removed -- confirm.
    junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
    file.remove(junk)
    # "Present" when at least one OCR line contains `word` (case-insensitive,
    # literal match), otherwise "Not Present".
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"  
    else "Present" 
    print(br)       
    # Stop at the first page containing `word`. Because of this break, the
    # chk_words loop below only runs for pages on which `word` was NOT found.
    if(br=="Present") {
      v1[i] <- j
      break}

    for(k in chk_words){ 
      # print() returns its argument, so br receives the printed string.
      br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){ print("Not Present") } else {print("Present")}
      # Only `ps=k` belongs to this unbraced if; the two statements after it
      # run for every k, so x[[k]] can receive a ps left over from an earlier
      # match.
      if(br == "Present")
        ps=k
      x[[k]]=ps
      tc=unlist(unique(x))
    }




  }

  print(tc)
  # Derive the per-file result from v1[i]: 0 means `word` was never found.
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else 
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  # NOTE(review): tc is passed to cbind() as an extra unnamed column. When tc
  # holds several words the scalar values are recycled into several rows, and
  # while tc is still empty no such column is produced, so the shape handed to
  # rbind() can differ between files -- confirm this is intended.
  df <- rbind(df, cbind(file_name = basename(file_name),
                        Status, pages = pages, words = words,tc))


}

Any suggestions would be appreciated.

Thanks


回答1:


Here is an option for a single word:

# Search each PDF in `All_files` for a single word and record, per file,
# whether the word occurs and the first page on which it was found.
# Result: `df` with one row per file and the columns file_name, Status,
# pages and words ("-" in pages/words when the word was not found).
v1 <- numeric(length(All_files))  # first matching page per file; 0 = no match
word <- "school"
Status <- "Present"  # initial value only; reassigned for every file below
rows <- vector("list", length(All_files))

for (i in seq_along(All_files)) {
  file_name <- All_files[i]

  cnt <- pdftools::pdf_info(file_name)$pages
  print(cnt)

  for (j in seq_len(cnt)) {
    # Render only page j to a TIFF image and OCR it.
    img_file <- pdftools::pdf_convert(
      file_name, format = "tiff", pages = j, dpi = 400
    )
    text <- ocr(img_file)
    # Delete exactly the image that was just created. This avoids listing a
    # directory through a separately defined path and passing bare file
    # names to file.remove().
    file.remove(img_file)

    # Case-insensitive literal match anywhere in the page text.
    hit <- any(stri_detect_fixed(tolower(text), tolower(word)), na.rm = TRUE)
    br <- if (hit) "Present" else "Not Present"
    print(br)
    if (hit) {
      v1[i] <- j
      break  # only the first matching page is reported
    }
  }

  found <- v1[i] != 0
  Status <- if (found) "Present" else "Not Present"
  pages <- if (found) {
    paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  } else {
    "-"
  }
  words <- if (found) word else "-"
  rows[[i]] <- data.frame(
    file_name = basename(file_name), Status = Status,
    pages = pages, words = words, stringsAsFactors = FALSE
  )
}

# Bind once after the loop instead of growing `df` with rbind() on each pass.
df <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()

-output

df
#     file_name      Status  pages  words
#1 Amenities.pdf Not Present      -      -
#2      test.pdf     Present test_2 school


来源:https://stackoverflow.com/questions/46398105/error-in-writing-data-frame-in-r

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!