feat: new fuzzy sorter

2020-09-01 22:00:55 -04:00
parent 4ac50c68ca
commit c11a661362
6 changed files with 212 additions and 46 deletions
--- a/lua/telescope/builtin.lua
+++ b/lua/telescope/builtin.lua
@@ -29,12 +29,14 @@ builtin.git_files = function(opts)
    end)

    or nil
+
  pickers.new(opts, {
    prompt    = 'Git File',
    finder    = finders.new_oneshot_job({ "git", "ls-files" }, make_entry),
    previewer = previewers.cat,
-    sorter    = sorters.get_norcalli_sorter(),
+    sorter    = sorters.get_fuzzy_file(),
  }):find()
+
 end

 builtin.live_grep = function(opts)
--- a/lua/telescope/pickers.lua
+++ b/lua/telescope/pickers.lua
@@ -276,6 +276,8 @@ function Picker:find()
    end

    local process_complete = vim.schedule_wrap(function()
+      -- TODO: We should either: always leave one result or make sure we actually clean up the results when nothing matches
+
      if selection_strategy == 'row' then
        self:set_selection(self:get_selection_row())
      elseif selection_strategy == 'follow' then
--- a/lua/telescope/sorters.lua
+++ b/lua/telescope/sorters.lua
@@ -73,13 +73,138 @@ end

 -- TODO: Match on upper case words
 -- TODO: Match on last match
-sorters.get_fuzzy_file = function()
-  local cached_tails = {}
+--- Build a Sorter that scores file paths by n-gram overlap with the prompt.
+-- Scores: 0 when the prompt is shorter than `ngram_len`; -1 when the weight is
+-- zero, NaN, or (prompt longer than 2 chars) under 0.5; otherwise 1 / weight.
+-- @param opts table|nil: `ngram_len` (number, default 2) sets the n-gram length
+-- @return the object built by `Sorter:new`
+sorters.get_fuzzy_file = function(opts)
+  opts = opts or {}
+
+  local ngram_len = opts.ngram_len or 2
+  local os_sep = '/'
+
+  -- path -> text after its last `os_sep`; computed on first lookup, then stored.
+  local cached_tails = setmetatable({}, {
+    __index = function(cache, path)
+      local segments = vim.split(path, os_sep)
+      local tail = segments[#segments]
+      rawset(cache, path, tail)
+      return tail
+    end,
+  })
+
+  -- string -> set of the ASCII capitals (A-Z) it contains; filled in lazily.
+  local cached_uppers = setmetatable({}, {
+    __index = function(cache, str)
+      local capitals = {}
+      for letter in str:gmatch('[A-Z]') do
+        capitals[letter] = true
+      end
+      rawset(cache, str, capitals)
+      return capitals
+    end
+  })
+
+  -- cached_ngrams[s][n] is the n-gram list already computed for string `s`.
  local cached_ngrams = {}

+  -- Returns every length-`n` substring of `s`, left to right (memoized).
+  local function overlapping_ngrams(s, n)
+    local by_length = cached_ngrams[s]
+    if by_length and by_length[n] then
+      return by_length[n]
+    end
+
+    local grams = {}
+    for first = 1, s:len() - n + 1 do
+      grams[#grams + 1] = s:sub(first, first + n - 1)
+    end
+    if not by_length then
+      by_length = {}
+      cached_ngrams[s] = by_length
+    end
+    by_length[n] = grams
+    return grams
+  end
+
  return Sorter:new {
    scoring_function = function(_, prompt, line)
-      return 1
+      local N = #prompt
+
+      -- A prompt shorter than one n-gram leaves nothing to compare.
+      if N == 0 or N < ngram_len then
+        -- TODO: If the character is in the line,
+        -- then it should get a point or something.
+        return 0
+      end
+
+      local prompt_lower = prompt:lower()
+      local line_lower = line:lower()
+      local prompt_lower_ngrams = overlapping_ngrams(prompt_lower, ngram_len)
+
+      -- Index where the whole prompt occurs in the line, ignoring case, or nil.
+      local contains_string = line_lower:find(prompt_lower, 1, true)
+
+      -- Count the distinct capital letters shared by prompt and line.
+      local line_uppers = cached_uppers[line]
+      local uppers_matching = 0
+      for letter in pairs(cached_uppers[prompt]) do
+        if line_uppers[letter] then
+          uppers_matching = uppers_matching + 1
+        end
+      end
+
+      -- The tail is lowercased, so it must be searched with the lowercased prompt.
+      local contains_tail = cached_tails[line_lower]:find(prompt_lower, 1, true)
+
+      -- match_count: n-grams found in the line. consecutive_matches: those whose
+      -- first occurrence starts after the previous matched n-gram's first occurrence.
+      local consecutive_matches = 0
+      local previous_match_index = 0
+      local match_count = 0
+
+      for i = 1, #prompt_lower_ngrams do
+        local match_start = line_lower:find(prompt_lower_ngrams[i], 1, true)
+        if match_start then
+          match_count = match_count + 1
+          if match_start > previous_match_index then
+            consecutive_matches = consecutive_matches + 1
+          end
+          previous_match_index = match_start
+        end
+      end
+
+      -- A hit inside the tail (last path segment) doubles the weight.
+      local tail_modifier = contains_tail and 2 or 1
+
+      -- TODO: Copied from ashkan.
+      local denominator = (
+        (10 * match_count / #prompt_lower_ngrams)
+        -- biases for shorter strings
+        -- TODO(ashkan): this can bias towards repeated finds of the same
+        -- subpattern with overlapping_ngrams
+        + 3 * match_count * ngram_len / #line
+        + consecutive_matches
+        + N / (contains_string or (2 * #line))
+        -- + 30/(c1 or 2*N)
+
+        -- TODO: It might be possible that this too strongly correlates,
+        --          but it's unlikely for people to type capital letters without actually
+        --          wanting to do something with a capital letter in it.
+        + uppers_matching
+      ) * tail_modifier
+
+      -- x ~= x is true only for NaN (e.g. 0 / 0 when `line` is empty).
+      if denominator == 0 or denominator ~= denominator then
+        return -1
+      end
+
+      if N > 2 and denominator < 0.5 then
+        return -1
+      end
+
+      return 1 / denominator
    end
  }
 end