I have a large string and a list of search strings, and I want to build a boolean list indicating whether or not each of the search strings exists in the large string. What is the fastest way to do this?
I am posting this just for comparison. My comparison code:
#!/usr/bin/env python3
def gettext():
    """Return the contents of all ``*.txt`` files in the current directory.

    Each file is read in text mode with the default encoding, and the
    contents are joined with a single space in whatever order
    ``os.scandir`` yields the directory entries.
    """
    from os import scandir

    chunks = []
    # Both the directory iterator and every file are context-managed so the
    # underlying handles are released immediately rather than whenever the
    # garbage collector gets to them.
    with scandir('.') as entries:
        for entry in entries:
            if entry.name.endswith('.txt'):
                with open(entry.name) as handle:
                    chunks.append(handle.read())
    return ' '.join(chunks)
def getsearchterms():
    """Return the distinct ``;``-separated terms stored in ``searchterms``.

    The file is looked up relative to the current directory. Duplicates are
    removed by passing the terms through a ``set``, so the order of the
    returned list is arbitrary.
    """
    # Read inside a ``with`` block so the file handle is closed right away.
    with open('searchterms') as handle:
        raw = handle.read()
    return list(set(raw.split(';')))
def rob(search_string, input_string):
    """Report, per search term, whether it occurs in ``input_string``.

    Parameters:
        search_string: iterable of literal terms to look for.
        input_string: the text to search.

    Returns:
        A list of booleans, one per term, in the order the terms were given.

    The terms are combined into one alternation with a capturing group per
    term; the group that took part in each match identifies the term found.
    """
    import re

    terms = list(search_string)
    found = [False] * len(terms)
    if not terms:
        return found

    # Escape every term so that regex metacharacters are matched literally
    # and the number of groups always equals the number of terms.
    pattern = re.compile('|'.join('(' + re.escape(term) + ')' for term in terms))

    # Exactly one group participates in each match, so ``lastindex`` is the
    # 1-based position of the matching term. Iterating lazily also keeps the
    # result shape correct for a single term and for a text with no matches.
    for match in pattern.finditer(input_string):
        found[match.lastindex - 1] = True

    # An alternation only reports non-overlapping, leftmost matches, so a term
    # can be hidden by another one (e.g. 'abc' behind 'ab'). Confirm the
    # terms that were not seen with a direct substring check.
    return [hit or term in input_string for hit, term in zip(found, terms)]
def blotosmetek(search_strings, input_string):
    """Report, per search term, whether it occurs in ``input_string``.

    Parameters:
        search_strings: iterable of literal terms to look for.
        input_string: the text to search.

    Returns:
        A list of booleans, one per term, in the order the terms were given.

    All terms are escaped and joined into a single alternation; every term
    the regex reports is collected into a set for constant-time lookup.
    """
    import re

    # Materialise once: the terms are iterated twice below.
    terms = list(search_strings)
    if not terms:
        # An empty alternation would match the empty string at every position.
        return []

    regexp = re.compile('|'.join(re.escape(term) for term in terms))
    found = set(regexp.findall(input_string))

    # The alternation only yields non-overlapping, leftmost matches, so a term
    # can be hidden by another one that starts earlier or is listed first
    # (e.g. 'abc' behind 'ab'). Terms the regex did not report are therefore
    # confirmed with a direct substring check.
    return [term in found or term in input_string for term in terms]
def ahocorasick(search_list, input):
    """Report, per search term, whether it occurs in ``input``.

    Parameters:
        search_list: sequence of terms to look for.
        input: the text to search (the name shadows the builtin but is kept
            so existing callers are unaffected).

    Returns:
        A list of booleans, one per entry of ``search_list``. Earlier
        versions returned 0/1 integers; ``True == 1`` and ``False == 0``, so
        equality comparisons against the old values still hold.

    Requires the third-party ``ahocorasick`` module.
    """
    import ahocorasick

    found = [False] * len(search_list)
    if not search_list:
        # NOTE(review): an automaton without any words presumably cannot be
        # iterated - confirm against the library docs. Skipping it is safe
        # either way because there is nothing to report.
        return found

    # Group positions by term: adding the same word twice would replace the
    # stored value, leaving earlier duplicates permanently marked as absent.
    positions = {}
    for idx, term in enumerate(search_list):
        positions.setdefault(term, []).append(idx)

    automaton = ahocorasick.Automaton()
    for term, indexes in positions.items():
        automaton.add_word(term, indexes)
    automaton.make_automaton()

    # Each item is a pair whose second element is the value stored for the
    # matched word, i.e. the list of positions of that term.
    for _end, indexes in automaton.iter(input):
        for idx in indexes:
            found[idx] = True
    return found
def naive(search_list, text):
    """Check each term of ``search_list`` against ``text`` with ``in``.

    Returns a list holding one membership result per term, in input order.
    """
    flags = []
    for term in search_list:
        flags.append(term in text)
    return flags
def test(fn, args):
    """Call ``fn(*args)`` once and return the elapsed time in seconds.

    Parameters:
        fn: the callable to time.
        args: sequence of positional arguments passed to ``fn``.

    Returns:
        The duration of the call as a float number of seconds. The return
        value of ``fn`` is discarded.
    """
    # Imported locally so the helper does not rely on a module-level name
    # that only exists when the file is executed as a script.
    # ``perf_counter`` is a monotonic, high-resolution clock intended for
    # measuring intervals, unlike wall-clock time, which can be adjusted.
    from time import perf_counter

    start = perf_counter()
    fn(*args)
    return perf_counter() - start
if __name__ == '__main__':
    # Bound as a module-level name before any benchmark is run.
    from datetime import datetime

    # Load the corpus and the search terms, reporting the size of each.
    text = gettext()
    print("Got text, total of", len(text), "characters")
    search_strings = getsearchterms()
    print("Got search terms, total of", len(search_strings), "words")

    # Time every implementation on the same inputs and print the duration
    # converted from seconds to milliseconds.
    for fn in (ahocorasick, blotosmetek, naive, rob):
        seconds = test(fn, [search_strings, text])
        print(fn.__name__, seconds*1000, "ms")
I used the distinct words that appear in Leviathan as search terms, and the concatenation of the 25 most-downloaded books from Project Gutenberg as the text to search. The results are as follows:
Got text, total of 18252025 characters
Got search terms, total of 12824 words
ahocorasick 3824.111 milliseconds
Błotosmętek 360565.542 milliseconds
naive 73765.67 ms
Rob's version has already been running for about an hour and still hasn't finished. Maybe it's broken, or maybe it's simply painfully slow.