问题
I'm trying to search a word from the text that I extract from the pdf file which is OCR'd format. This pdf file has multiple pages, so for each page, I'm searching that word, if that word is found then write the filename, status(Present or Not Present),Page on which it is found and what words it has found to a dataframe . But the dataframe is giving the status "Present" for all files, I just want like this
file_name Status Page words
test1.pdf "Present" test1_2,test1_4 gym,school
test2.pdf "Not Present" - -
test3.pdf "Present" test3_1 gym
what m I missing in this code.
here is the code
All_files=Sys.glob("*.pdf")
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
tc=c()
ps=c()
x=list()
df <- data.frame()
Status="Present"
for (i in seq_along(All_files)){
file_name <- All_files[i]
cnt <- pdf_info(All_files[i])$pages
print(cnt)
for(j in seq_len(cnt)){
img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
text <- ocr(img_file)
ocr_text <- capture.output(cat(text))
check <- sapply(ocr_text, paste, collapse="")
junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
file.remove(junk)
br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"
else "Present"
print(br)
if(br=="Present") {
v1[i] <- j
break}
for(k in chk_words){
br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){ print("Not Present") } else {print("Present")}
if(br == "Present")
ps=k
x[[k]]=ps
tc=unlist(unique(x))
}
}
print(tc)
Status <- if(v1[i] == 0) "Not Present" else "Present"
pages <- if(v1[i] == 0) "-" else
paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
words <- if(v1[i] == 0) "-" else word
df <- rbind(df, cbind(file_name = basename(file_name),
Status, pages = pages, words = words,tc))
}
Any suggestion is appreciable.
Thanks
回答1:
Here is an option for single word
v1 <- numeric(length(All_files))
word <- "school"
df <- data.frame()
Status="Present"
for (i in seq_along(All_files)){
file_name <- All_files[i]
cnt <- pdf_info(All_files[i])$pages
print(cnt)
for(j in seq_len(cnt)){
img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
text <- ocr(img_file)
ocr_text <- capture.output(cat(text))
check <- sapply(ocr_text, paste, collapse="")
junk <- dir(path= paste0(path, "/tiff"), pattern="tiff")
file.remove(junk)
br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present"
else "Present"
print(br)
if(br=="Present") {
v1[i] <- j
break}
}
Status <- if(v1[i] == 0) "Not Present" else "Present"
pages <- if(v1[i] == 0) "-" else
paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
words <- if(v1[i] == 0) "-" else word
df <- rbind(df, cbind(file_name = basename(file_name),
Status, pages = pages, words = words))
}
-output
df
# file_name Status pages words
#1 Amenities.pdf Not Present - -
#2 test.pdf Present test_2 school
来源:https://stackoverflow.com/questions/46398105/error-in-writing-data-frame-in-r