I have a dictionary of 50K to 100K strings (can be up to 50+ characters) and I am trying to find whether a given string is in the dictionary with some \"edit\" distance toleranc
Here is raw Trie-like implementation. It is totally not optimized, just a proof of concept. Pure Ruby implementation.
To test it I took 100_000 words from here http://www.infochimps.com/datasets/word-list-100000-official-crossword-words-excel-readable/downloads/195488
here is a gist for it https://gist.github.com/fl00r/7542994
class TrieDict
attr_reader :dict
def initialize
@dict = {}
end
def put(str)
d = nil
str.chars.each do |c|
d && (d = (d[1][c] ||= [nil, {}])) || d = (@dict[c] ||= [nil, {}])
end
d[0] = true
end
def fetch(prefix, fuzzy = 0)
storage = []
str = ""
error = 0
recur_fetch(prefix, fuzzy, @dict, storage, str, error)
storage
end
def recur_fetch(prefix, fuzzy, dict, storage, str, error)
dict.each do |k, vals|
e = error
if prefix[0] != k
e += 1
next if e > fuzzy
end
s = str + k
storage << s if vals[0] && (prefix.size - 1) <= (fuzzy - e)
recur_fetch(prefix[1..-1] || "", fuzzy, vals[1], storage, s, e)
end
end
end
def bench
t = Time.now.to_f
res = nil
10.times{ res = yield }
e = Time.now.to_f - t
puts "Elapsed for 10 times: #{e}"
puts "Result: #{res}"
end
trie = TrieDict.new
File.read("/home/petr/code/tmp/words.txt").each_line do |word|
trie.put(word.strip)
end; :ok
# Elapsed for 10 times: 0.0006465911865234375
# Result: ["hello"]
bench{ trie.fetch "hello", 1 }
# Elapsed for 10 times: 0.013643741607666016
# Result: ["cello", "hallo", "helio", "hell", "hello", "hellos", "hells", "hillo", "hollo", "hullo"]
bench{ trie.fetch "hello", 2 }
# Elapsed for 10 times: 0.08267641067504883
# Result: ["bell", "belle", "bellow", "bells", "belly", "cell", "cella", "celli", "cello", "cellos", "cells", "dell", "dells", "delly", "fell", "fella", "felloe", "fellow", "fells", "felly", "hall", "hallo", "halloa", "halloo", "hallos", "hallot", "hallow", "halls", "heal", "heals", "heel", "heels", "heil", "heils", "held", "helio", "helios", "helix", "hell", "helled", "heller", "hello", "helloed", "helloes", "hellos", "hells", "helm", "helms", "helot", "help", "helps", "helve", "herl", "herls", "hill", "hillo", "hilloa", "hillos", "hills", "hilly", "holla", "hollo", "holloa", "holloo", "hollos", "hollow", "holly", "hull", "hullo", "hulloa", "hullos", "hulls", "jell", "jells", "jelly", "mell", "mellow", "mells", "sell", "selle", "sells", "tell", "tells", "telly", "well", "wells", "yell", "yellow", "yells"]
bench{ trie.fetch "engineer", 2 }
# Elapsed for 10 times: 0.04654884338378906
# Result: ["engender", "engine", "engined", "engineer", "engineered", "engineers", "enginery", "engines"]
bench{ trie.fetch "engeneer", 1 }
# Elapsed for 10 times: 0.005484580993652344
# Result: ["engender", "engineer"]