110 lines
2.4 KiB
Lua
110 lines
2.4 KiB
Lua
local NGram = {}
|
|
NGram.__index = NGram
|
|
|
|
function NGram:new(opts)
|
|
-- TODO: Add padding
|
|
opts = opts or {}
|
|
return setmetatable({
|
|
N = opts.N or 2,
|
|
split = opts.split or "/",
|
|
_depth = 5,
|
|
_grams = setmetatable({}, utils.default_table_mt)
|
|
}, self)
|
|
end
|
|
|
|
local min = math.min
|
|
|
|
function NGram:_split(word)
|
|
local word_len = #word
|
|
|
|
local result = {}
|
|
for i = 1, word_len - 1 do
|
|
-- for j = i + (self.N - 1), min(i + self._depth - 1, word_len) do
|
|
-- table.insert(result, string.sub(word, i, j))
|
|
-- end
|
|
table.insert(result, string.sub(word, i, i + self.N - 1))
|
|
end
|
|
|
|
return result
|
|
end
|
|
|
|
-- local function pairsByKeys (t, f)
|
|
-- local a = {}
|
|
-- for n in pairs(t) do table.insert(a, n) end
|
|
-- table.sort(a, f)
|
|
-- local i = 0 -- iterator variable
|
|
-- local iter = function () -- iterator function
|
|
-- i = i + 1
|
|
-- if a[i] == nil then return nil
|
|
-- else return a[i], t[a[i]]
|
|
-- end
|
|
-- end
|
|
-- return iter
|
|
-- end
|
|
|
|
function NGram:add(word)
|
|
local split_word = self:_split(word)
|
|
|
|
for _, k in ipairs(split_word) do
|
|
local counts = self._grams[k]
|
|
if counts[word] == nil then
|
|
counts[word] = 0
|
|
end
|
|
|
|
counts[word] = counts[word] + 1
|
|
end
|
|
end
|
|
|
|
function NGram:_items_sharing_ngrams(query)
|
|
local split_query = self:_split(query)
|
|
|
|
-- Matched string to number of N-grams shared with the query string.
|
|
local shared = {}
|
|
|
|
local remaining = {}
|
|
|
|
for _, ngram in ipairs(split_query) do
|
|
remaining = {}
|
|
for match, count in pairs(self._grams[ngram] or {}) do
|
|
remaining[match] = remaining[match] or count
|
|
|
|
if remaining[match] > 0 then
|
|
remaining[match] = remaining[match] - 1
|
|
shared[match] = (shared[match] or 0) + 1
|
|
end
|
|
end
|
|
end
|
|
|
|
return shared
|
|
end
|
|
|
|
function NGram:search(query, show_values)
|
|
local sharing_ngrams = self:_items_sharing_ngrams(query)
|
|
|
|
local results = {}
|
|
for name, count in pairs(sharing_ngrams) do
|
|
local allgrams = #query + #name - (2 * self.N) - count + 2
|
|
table.insert(results, {name, count / allgrams})
|
|
end
|
|
|
|
table.sort(results, function(left, right)
|
|
return left[2] > right[2]
|
|
end)
|
|
|
|
if not show_values then
|
|
for k, v in ipairs(results) do
|
|
results[k] = v[1]
|
|
end
|
|
end
|
|
|
|
return results
|
|
end
|
|
|
|
function NGram:find(query)
|
|
return self:search(query)[1]
|
|
end
|
|
|
|
function NGram:score(query)
|
|
return (self:search(query, true)[1] or {})[2] or 0
|
|
end
|