I have an array of 800 sentences. I want to remove all duplicates (sentences that have the same exact words, but in different order) from the array. So for example \"this is
Here's something to try. I didn't test its performance on large arrays, but I think it should be ok. No jQuery needed.
function removeDuplicates(array)
{
var new_array = [];
for(var i=0; i<array.length; i++){
// convert current sentence to sorted, lowercase string
var sen = array[i].split(" ").sort().join(" ");
if(new_array.indexOf(sen) == -1){
// no matches, let's add it!
new_array.push(sen);
}
}
return new_array;
}
Array.prototype.indexOf = function(item, optional_start_index)
{
for(var i=optional_start_index||0; i<this.length; i++){
if(this[i] == item) return i;
}
return -1;
}
Use it like this:
var a = ["this is a name", "name is this a", "this name is a", "hello there"];
var clean_array = removeDuplicates(a);
alert(clean_array); // outputs: a is name this,hello there
This is a very simple implementation that takes advantage of some jQuery.
Check the demo here ->
And the source:
var arr = ["This is a sentence", "Is this a sentence", "potatoes"];
var newArr = [];
var sortedArr = [];
$.each(arr, function(i) {
var temp = this.toLowerCase().split(" ").sort(function(a,b) {
return a > b;
}).join(' ');
if ($.inArray(temp, sortedArr) == -1) {
sortedArr.push(temp);
newArr.push(arr[i]);
}
});
//output
$.each(newArr, function() {
document.write(this + '<br />');
});
It uses three arrays: a source, a collection of sorted sentences to match against, and the output array. Matching is performed by splitting a sentence by spaces, converting to lowercase, and sorting the words alphabetically, then rebuilding the sentence string. If that particular combo has been seen before, it is not added to the results. If it has not, it is added.
The loop at the end simply outputs the resultant array.
- JavaScript similar_text Calculates the similarity between two strings
Use an Object as a lookup to get a quick hashtable-backed check. That means using string as your key type, which means normalising the case/ordering/etc of the words first to get a unique key for each combination of words.
// Get key for sentence, removing punctuation and normalising case and word order
// eg 'Hello, a horse!' -> 'x_a hello horse'
// the 'x_' prefix is to avoid clashes with any object properties with undesirable
// special behaviour (like prototype properties in IE) and get a plain lookup
//
function getSentenceKey(sentence) {
var trimmed= sentence.replace(/^\s+/, '').replace(/\s+$/, '').toLowerCase();
var words= trimmed.replace(/[^\w\s]+/g, '').replace(/\s+/, ' ').split(' ');
words.sort();
return 'x_'+words.join(' ');
}
var lookup= {};
for (var i= sentences.length; i-->0;) {
var key= getSentenceKey(sentences[i]);
if (key in lookup)
sentences.splice(i, 1);
else
lookup[key]= true;
}
Would need some work if you need to support non-ASCII characters (\w
doesn't play well with Unicode in JS, and the question of what constitutes a word in some languages is a difficult one). Also, is "foo bar foo" the same sentence as "bar bar foo"?
Sort the array of sentences, and then loop through it and delete an item if it is the same as the previous one:
texts.sort();
for(var i = 1; i < texts.length; i++){
if(texts[i] === texts[i-1]){
texts.splice(i,1);
i--;
}
}
I tested this in an array with 800 strings, and it seemed reasonably fast.
EDIT: Sorry, didn't read your question very carefully
How about this?
va = ["this is a sentence", "sentence this is", "sentence this is a"]
vb = {} // dictionary of combined sorted words in each sentence
vc = [] // output list of sentences without duplicates
for (i=0;i<va.length;i++){
// split sentence, sort words, and recombine (this is a sentence => a is sentence this)
var combined = va[i].split(" ").sort().join(" ");
if (!vb[combined]){ // if set of combined sorted words doesn't exist already
vc.push(va[i]); // sentence isn't duplicated, push to output list
vb[combined] = true // add set to dictionary
}
}
alert(vc.join("\n"))