Input:
1) A huge sorted array of string SA;
2) A prefix string P;
Output:
The index of the first string matching the input prefix if any. If ther
It can be done in linear time using a Suffix Tree. Building the suffix tree takes linear time.
Here is a possible solution (in Python), which has O(k.log(n)) time complexity and O(1) additional space complexity (considering n strings and k prefix length).
The rationale behind it to perform a binary search which only considers a given character index of the strings. If these are present, continue to the next character index. If any of the prefix characters cannot be found in any string, it returns immediately.
from typing import List
def first(items: List[str], prefix: str, i: int, c: str, left: int, right: int):
result = -1
while left <= right:
mid = left + ((right - left) // 2)
if ( i >= len(items[mid]) ):
left = mid + 1
elif (c < items[mid][i]):
right = mid - 1
elif (c > items[mid][i]):
left = mid + 1
else:
result = mid
right = mid - 1
return result
def last(items: List[str], prefix: str, i: int, c: str, left: int, right: int):
result = -1
while left <= right:
mid = left + ((right - left) // 2)
if ( i >= len(items[mid]) ):
left = mid + 1
elif (c < items[mid][i]):
right = mid - 1
elif (c > items[mid][i]):
left = mid + 1
else:
result = mid
left = mid + 1
return result
def is_prefix(items: List[str], prefix: str):
left = 0
right = len(items) - 1
for i in range(len(prefix)):
c = prefix[i]
left = first(items, prefix, i, c, left, right)
right = last(items, prefix, i, c, left, right)
if (left == -1 or right == -1):
return False
return True
# Test cases
a = ['ab', 'abjsiohjd', 'abikshdiu', 'ashdi','abcde Aasioudhf', 'abcdefgOAJ', 'aa', 'aaap', 'aas', 'asd', 'bbbbb', 'bsadiojh', 'iod', '0asdn', 'asdjd', 'bqw', 'ba']
a.sort()
print(a)
print(is_prefix(a, 'abcdf'))
print(is_prefix(a, 'abcde'))
print(is_prefix(a, 'abcdef'))
print(is_prefix(a, 'abcdefg'))
print(is_prefix(a, 'abcdefgh'))
print(is_prefix(a, 'abcde Aa'))
print(is_prefix(a, 'iod'))
print(is_prefix(a, 'ZZZZZZiod'))
This gist is available at https://gist.github.com/lopespm/9790d60492aff25ea0960fe9ed389c0f