The question is a little complex. The problem here is to get rid of duplicates and save the unique elements of array into another array with their original sequence.
As Thomas suggested in a comment, if each element of the array is guaranteed to be from a limited set of values (such as a char
) you can achieve this in O(n)
time.
bool
(or int
if your compiler doesn't support bool
) or however many different discrete values could possibly be in the array. Initialize all the values to false
.bool
array is false
, add it to the output array and set the bool
array value to true
. Otherwise, do nothing.Well, here is a version for char
types. Note it doesn't scale.
#include "stdio.h"
#include "string.h"
void removeDuplicates(unsigned char *string)
{
unsigned char allCharacters [256] = { 0 };
int lookAt;
int writeTo = 0;
for(lookAt = 0; lookAt < strlen(string); lookAt++)
{
if(allCharacters[ string[lookAt] ] == 0)
{
allCharacters[ string[lookAt] ] = 1; // mark it seen
string[writeTo++] = string[lookAt]; // copy it
}
}
string[writeTo] = '\0';
}
int main()
{
char word[] = "abbbcdefbbbghasffffdaiouasdf";
removeDuplicates(word);
printf("Word is now [%s]\n", word);
return 0;
}
The following is the output:
Word is now [abcdefghsiou]
Is that something like what you want? You can modify the method if there are spaces between the letters, but if you use int
, float
, double
or char *
as the types, this method won't scale at all.
EDIT
I posted and then saw your clarification, where it's an array of char *
. I'll update the method.
I hope this isn't too much code. I adapted this QuickSort algorithm and basically added index memory to it. The algorithm is O(n log n), as the 3 steps below are additive and that is the worst case complexity of 2 of them.
originalIndices
holds the original index of the i'th element of the sorted array.NULL
, and setting the index value to elements
, which is the highest any can be.NULL
.Code:
#include "stdio.h"
#include "string.h"
#include "stdlib.h"
void sortArrayAndSetCriteria(char **arr, int elements, int *originalIndices)
{
#define MAX_LEVELS 1000
char *piv;
int beg[MAX_LEVELS], end[MAX_LEVELS], i=0, L, R;
int idx, cidx;
for(idx = 0; idx < elements; idx++)
originalIndices[idx] = idx;
beg[0] = 0;
end[0] = elements;
while (i>=0)
{
L = beg[i];
R = end[i] - 1;
if (L<R)
{
piv = arr[L];
cidx = originalIndices[L];
if (i==MAX_LEVELS-1)
return;
while (L < R)
{
while (strcmp(arr[R], piv) >= 0 && L < R) R--;
if (L < R)
{
arr[L] = arr[R];
originalIndices[L++] = originalIndices[R];
}
while (strcmp(arr[L], piv) <= 0 && L < R) L++;
if (L < R)
{
arr[R] = arr[L];
originalIndices[R--] = originalIndices[L];
}
}
arr[L] = piv;
originalIndices[L] = cidx;
beg[i + 1] = L + 1;
end[i + 1] = end[i];
end[i++] = L;
}
else
{
i--;
}
}
}
int removeDuplicatesFromBoth(char **arr, int elements, int *originalIndices)
{
// now remove duplicates
int i = 1, newLimit = 1;
char *curr = arr[0];
while (i < elements)
{
if(strcmp(curr, arr[i]) == 0)
{
arr[i] = NULL; // free this if it was malloc'd
originalIndices[i] = elements; // place it at the end
}
else
{
curr = arr[i];
newLimit++;
}
i++;
}
return newLimit;
}
void sortArrayBasedOnCriteria(char **arr, int elements, int *originalIndices)
{
#define MAX_LEVELS 1000
int piv;
int beg[MAX_LEVELS], end[MAX_LEVELS], i=0, L, R;
int idx;
char *cidx;
beg[0] = 0;
end[0] = elements;
while (i>=0)
{
L = beg[i];
R = end[i] - 1;
if (L<R)
{
piv = originalIndices[L];
cidx = arr[L];
if (i==MAX_LEVELS-1)
return;
while (L < R)
{
while (originalIndices[R] >= piv && L < R) R--;
if (L < R)
{
arr[L] = arr[R];
originalIndices[L++] = originalIndices[R];
}
while (originalIndices[L] <= piv && L < R) L++;
if (L < R)
{
arr[R] = arr[L];
originalIndices[R--] = originalIndices[L];
}
}
arr[L] = cidx;
originalIndices[L] = piv;
beg[i + 1] = L + 1;
end[i + 1] = end[i];
end[i++] = L;
}
else
{
i--;
}
}
}
int removeDuplicateStrings(char *words[], int limit)
{
int *indices = (int *)malloc(limit * sizeof(int));
int newLimit;
sortArrayAndSetCriteria(words, limit, indices);
newLimit = removeDuplicatesFromBoth(words, limit, indices);
sortArrayBasedOnCriteria(words, limit, indices);
free(indices);
return newLimit;
}
int main()
{
char *words[] = { "abc", "def", "bad", "hello", "captain", "def", "abc", "goodbye" };
int newLimit = removeDuplicateStrings(words, 8);
int i = 0;
for(i = 0; i < newLimit; i++) printf(" Word @ %d = %s\n", i, words[i]);
return 0;
}
O(n)
operationO(log n)
operationFinally, O(n log n)
operation
i think that in C you can create a second array. then you copy the element from the original array only if this element is not already in the send array. this also preserve the order of the element.
if you read the element one by one you can discard the element before insert in the original array, this could speedup the process.
You know how to do it for char type, right? You can do same thing with strings, but instead of using array of bools (which is technically an implementation of "set" object), you'll have to simulate the "set"(or array of bools) with a linear array of strings you already encountered. I.e. you have an array of strings you already saw, for each new string you check if it is in array of "seen" strings, if it is, then you ignore it (not unique), if it is not in array, you add it to both array of seen strings and output. If you have a small number of different strings (below 1000), you could ignore performance optimizations, and simply compare each new string with everything you already saw before.
With large number of strings (few thousands), however, you'll need to optimize things a bit:
1) Every time you add a new string to an array of strings you already saw, sort the array with insertion sort algorithm. Don't use quickSort, because insertion sort tends to be faster when data is almost sorted.
2) When checking if string is in array, use binary search.
If number of different strings is reasonable (i.e. you don't have billions of unique strings), this approach should be fast enough.