I have a large data set, dedicated to biological journals, which was being composed for a long time by different people. So, the data are not in a single format. For example, in
This extends @joshua-daly 's excellent response in order to accomplish two useful goals.
(1) Finding permutations of names with n>2 words (eg. Robert Allen Zimmerman aka Bob Dylan)
(2) Performing searches defined over fewer than all names on record (eg. Bob Dylan).
library(gtools)
x <- c("Yoda","speaks","thus")
permutations(n=3, r=3, v=x, repeats.allowed = FALSE) # n=num.elems r=num.times v=x
# generate some random names
names <- c(
"John Smith",
"Robert Allen Zimmerman (Bob Dylan)",
"Everette Camille Arron",
"Valentina Riquelme Molina",
"Smith J",
"Smith John",
"John S",
"John Sally"
);
# drop parentheses, if any
names <- gsub("[(|)]", "", names)
# split those names and get all ways to write that name into a list of same length
split_names <- lapply(
X = gsub("[(|)]", "", names),
FUN = function(x){
print(x);
# split by a space
c_split = unlist(x = strsplit(x = x, split = " "));
# get all permutations of c_split to compensate for order
n <- r <- length(c_split)
c_splits <- list(permutations(n=n, r=r, v=c_split, repeats.allowed = FALSE))
# return c_splits
c_splits;
}
)
split_names
# suppose we're looking for this name
search_for <- "Bob Dylan";
# split it by " " and then find all ways to write that name
search_for_split <- unlist(x = strsplit(x = search_for, split = " "));
# permutations over search_for_split seem redundant
# initialize a vector containing if search_for was matched in names
match_statuses <- c();
# for each name that's been split
for(i in 1:length(names)){
# the match status for the current name
match_status <- FALSE;
# the current split name
c_split_name <- as.data.frame(split_names[[i]]);
# for each element in c_split_name
for(j in 1:nrow(c_split_name)){
# the current permutation of current split name
c_c_split_name <- as.matrix(c_split_name[j,]);
# will receive hits in name's words, one by one, in sequence
hits <- rep(0, 20) # length 20 should always be above max number of words in names
# for each element in search_for_split
for(k in 1:length(search_for_split)){
# the current permutation of name
c_search_for_split <- search_for_split[[k]];
# L first hits will receive hit counts
L <- min(ncol(c_c_split_name), length(search_for_split));
# will match as many words as the shortest current pair of names
for(l in 1:L){
# if there's a match, the length of grep is greater than zero
if(
# is c_search_for_split in c_c_split_name's lth element
length(
grep(
pattern = c_search_for_split,
x = as.character(c_c_split_name[l])
)
) > 0 ||
# or, is c_c_split_name's lth element in c_search_for_split
length(
grep(
pattern = c_c_split_name[l],
x = c_search_for_split
)
) > 0
# if this is the case, record a hit
){
hits[l] <- 1;
} else {
# otherwise, don't update hit
}
}
}
# take L first elements
hits <- hits[1:L]
# if hits vector has all ones for this permutation, update match status to TRUE
if(
sum(hits)/length(hits)==1 # <- can/should be made more flexible (agrep, or sum/length<1)
){
match_status <- TRUE;
} else {
# otherwise, don't update match status
}
}
# append match_status to the match_statuses list
match_statuses <- c(match_statuses, match_status);
}
search_for;
cbind(names, match_statuses);