optimize buffer indexing, add sync mode, add options for tweaking it

2025-05-19 04:09:37 +00:00 · 2021-11-02 13:04:10 +02:00 · 2021-11-02 13:04:10 +02:00 · 846c7230ff
commit 846c7230ff
parent d1ca295ce5
2 changed files with 186 additions and 82 deletions
--- a/lua/cmp_buffer/buffer.lua
+++ b/lua/cmp_buffer/buffer.lua
@ -1,10 +1,14 @@
 ---@class cmp_buffer.Buffer
 ---@field public bufnr number
---@field public regexes any[]
+---@field public regex any
 ---@field public length number
 ---@field public pattern string
+---@field public indexing_chunk_size number
+---@field public indexing_interval number
 ---@field public timer any|nil
---@field public words table<number, string[]>
+---@field public lines_words table<number, string[]>
+---@field public unique_words table<string, boolean>
+---@field public unique_words_dirty boolean
 ---@field public processing boolean
 local buffer = {}

@ -12,15 +16,22 @@ local buffer = {}
 ---@param bufnr number
 ---@param length number
 ---@param pattern string
+---@param indexing_chunk_size number
+---@param indexing_interval number
 ---@return cmp_buffer.Buffer
-function buffer.new(bufnr, length, pattern)
+function buffer.new(bufnr, length, pattern, indexing_chunk_size, indexing_interval)
  local self = setmetatable({}, { __index = buffer })
  self.bufnr = bufnr
-  self.regexes = {}
+  self.regex = vim.regex(pattern)
  self.length = length
  self.pattern = pattern
+  self.indexing_chunk_size = indexing_chunk_size
+  self.indexing_interval = indexing_interval
  self.timer = nil
-  self.words = {}
+  self.lines_count = 0
+  self.lines_words = {}
+  self.unique_words = {}
+  self.unique_words_dirty = true
  self.processing = false
  return self
 end
@ -32,28 +43,75 @@ function buffer.close(self)
    self.timer:close()
    self.timer = nil
  end
-  self.words = {}
+  self.lines_count = 0
+  self.lines_words = {}
+  self.unique_words = {}
+  self.unique_words_dirty = false
 end

 ---Indexing buffer
 function buffer.index(self)
  self.processing = true
-  local index = 1
-  local lines = vim.api.nvim_buf_get_lines(self.bufnr, 0, -1, false)
+
+  self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
+  local chunk_max_size = self.indexing_chunk_size
+  if chunk_max_size < 1 then
+    -- Index all lines in one go.
+    chunk_max_size = self.lines_count
+  end
+  local chunk_start = 0
+
+  if self.indexing_interval <= 0 then
+    -- sync algorithm
+
+    vim.api.nvim_buf_call(self.bufnr, function()
+      while chunk_start < self.lines_count do
+        local chunk_end = math.min(chunk_start + chunk_max_size, self.lines_count)
+        -- For some reason requesting line arrays multiple times in chunks
+        -- leads to much better memory usage than doing that in one big array,
+        -- which is why the sync algorithm has better memory usage than the
+        -- async one.
+        local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true)
+        for linenr = chunk_start + 1, chunk_end do
+          self.lines_words[linenr] = {}
+          self:index_line(linenr, chunk_lines[linenr - chunk_start])
+        end
+        chunk_start = chunk_end
+      end
+    end)
+
+    self:rebuild_unique_words()
+
+    self.processing = false
+    return
+  end
+
+  -- async algorithm
+
+  local lines = vim.api.nvim_buf_get_lines(self.bufnr, 0, -1, true)
+  -- This flag prevents vim.schedule() callbacks from piling up in the queue
+  -- when the indexing interval is very short.
+  local scheduled = false
+
  self.timer = vim.loop.new_timer()
-  self.timer:start(
-    0,
-    200,
-    vim.schedule_wrap(function()
-      local chunk = math.min(index + 1000, #lines)
+  self.timer:start(0, self.indexing_interval, function()
+    if scheduled then
+      return
+    end
+    scheduled = true
+    vim.schedule(function()
+      scheduled = false
+
+      local chunk_end = math.min(chunk_start + chunk_max_size, self.lines_count)
      vim.api.nvim_buf_call(self.bufnr, function()
-        for i = index, chunk do
-          self:index_line(i, lines[i] or '')
+        for linenr = chunk_start + 1, chunk_end do
+          self.lines_words[linenr] = {}
+          self:index_line(linenr, lines[linenr])
        end
      end)
-      index = chunk + 1
+      chunk_start = chunk_end

-      if chunk >= #lines then
+      if chunk_end >= self.lines_count then
        if self.timer then
          self.timer:stop()
          self.timer:close()
@ -62,88 +120,127 @@ function buffer.index(self)
        self.processing = false
      end
    end)
-  )
+  end)
 end

+-- See below.
+local shared_marker_table_for_preallocation = {}
+
 --- watch
 function buffer.watch(self)
+  -- NOTE: As far as I know, indexing in watching can't be done asynchronously
+  -- because even built-in commands generate multiple consecutive `on_lines`
+  -- events, and I'm not even mentioning plugins here. To get accurate results
+  -- we would have to either re-index the entire file on throttled events (slow
+  -- and loses the benefit of on_lines watching), or put the events in a
+  -- queue, which would complicate the plugin a lot. Plus, most changes which
+  -- trigger this event will be from regular editing, and so 99% of the time
+  -- they will affect only 1-2 lines.
  vim.api.nvim_buf_attach(self.bufnr, false, {
-    on_lines = vim.schedule_wrap(function(_, _, _, firstline, old_lastline, new_lastline, _, _, _)
-      if not vim.api.nvim_buf_is_valid(self.bufnr) then
-        self:close()
+    -- NOTE: line indexes are 0-based and the last line is not inclusive.
+    on_lines = function(_, _, _, first_line, old_last_line, new_last_line, _, _, _)
+      if not vim.api.nvim_buf_is_loaded(self.bufnr) then
        return true
      end

-      -- append
-      for i = old_lastline, new_lastline - 1 do
-        table.insert(self.words, i + 1, {})
-      end
-
-      -- remove
-      for _ = new_lastline, old_lastline - 1 do
-        table.remove(self.words, new_lastline + 1)
+      local delta = new_last_line - old_last_line
+      local new_lines_count = self.lines_count + delta
+      if delta > 0 then -- append
+        -- Explicitly reserve more slots in the array part of the lines table,
+        -- all of them will be filled in the next loop, but in reverse order
+        -- (which is why I am concerned about preallocation). Why is there no
+        -- built-in function to do this in Lua???
+        for i = self.lines_count + 1, new_lines_count do
+          self.lines_words[i] = shared_marker_table_for_preallocation
+        end
+        -- Move forwards the unchanged elements in the tail part.
+        for i = self.lines_count, old_last_line + 1, -1 do
+          self.lines_words[i + delta] = self.lines_words[i]
+        end
+        -- Fill in new tables for the added lines.
+        for i = old_last_line + 1, new_last_line do
+          self.lines_words[i] = {}
+        end
+      elseif delta < 0 then -- remove
+        -- Move backwards the unchanged elements in the tail part.
+        for i = old_last_line + 1, self.lines_count do
+          self.lines_words[i + delta] = self.lines_words[i]
+        end
+        -- Remove (already copied) tables from the end, in reverse order, so
+        -- that we don't make holes in the lines table.
+        for i = self.lines_count, new_lines_count + 1, -1 do
+          self.lines_words[i] = nil
+        end
      end
+      self.lines_count = new_lines_count

      -- replace lines
-      local lines = vim.api.nvim_buf_get_lines(self.bufnr, firstline, new_lastline, false)
+      local lines = vim.api.nvim_buf_get_lines(self.bufnr, first_line, new_last_line, true)
      vim.api.nvim_buf_call(self.bufnr, function()
        for i, line in ipairs(lines) do
-          if line then
-            self:index_line(firstline + i, line or '')
-          end
+          self:index_line(first_line + i, line)
        end
      end)
-    end),
+
+      self.unique_words_dirty = true
+    end,
+
+    on_detach = function(_)
+      self:close()
+    end,
  })
 end

 --- add_words
-function buffer.index_line(self, i, line)
-  local words = {}
+---@param linenr number
+---@param line string
+function buffer.index_line(self, linenr, line)
+  local words = self.lines_words[linenr]
+  for k, _ in ipairs(words) do
+    words[k] = nil
+  end
+  local word_i = 1

-  local buf = line
-  while true do
-    local s, e = self:matchstrpos(buf)
-    if s then
-      local word = string.sub(buf, s, e - 1)
+  local remaining = line
+  while #remaining > 0 do
+    -- NOTE: Both start and end indexes here are 0-based (unlike Lua strings),
+    -- and the end index is not inclusive.
+    local match_start, match_end = self.regex:match_str(remaining)
+    if match_start and match_end then
+      local word = remaining:sub(match_start + 1, match_end)
      if #word >= self.length then
-        table.insert(words, word)
+        words[word_i] = word
+        word_i = word_i + 1
      end
-    end
-    local new_buffer = string.sub(buf, e and e + 1 or 2)
-    if buf == new_buffer then
+      remaining = remaining:sub(match_end + 1)
+    else
      break
    end
-    buf = new_buffer
  end
-
-  self.words[i] = words
 end

 --- get_words
 function buffer.get_words(self)
-  local words = {}
-  for _, line in ipairs(self.words) do
+  -- NOTE: unique_words are rebuilt on-demand because it is common for the
+  -- watcher callback to be fired VERY frequently, and a rebuild needs to go
+  -- over ALL lines, not just the changed ones.
+  if self.unique_words_dirty then
+    self:rebuild_unique_words()
+  end
+  return self.unique_words
+end
+
+--- rebuild_unique_words
+function buffer.rebuild_unique_words(self)
+  for w, _ in pairs(self.unique_words) do
+    self.unique_words[w] = nil
+  end
+  for _, line in ipairs(self.lines_words) do
    for _, w in ipairs(line) do
-      table.insert(words, w)
+      self.unique_words[w] = true
    end
  end
-  return words
-end
-
--- matchstrpos
-function buffer.matchstrpos(self, text)
-  local s, e = self:regex(self.pattern):match_str(text)
-  if s == nil then
-    return nil, nil
-  end
-  return s + 1, e + 1
-end
-
--- regex
-function buffer.regex(self, pattern)
-  self.regexes[pattern] = self.regexes[pattern] or vim.regex(pattern)
-  return self.regexes[pattern]
+  self.unique_words_dirty = false
 end

 return buffer
--- a/lua/cmp_buffer/init.lua
+++ b/lua/cmp_buffer/init.lua
@ -6,6 +6,8 @@ local defaults = {
  get_bufnrs = function()
    return { vim.api.nvim_get_current_buf() }
  end,
+  indexing_chunk_size = 1000,
+  indexing_interval = 200,
 }

 local source = {}
@ -16,34 +18,37 @@ source.new = function()
  return self
 end

-source.get_keyword_pattern = function(_, params)
+source._validate_options = function(_, params)
  params.option = vim.tbl_deep_extend('keep', params.option, defaults)
  vim.validate({
-    keyword_length = { params.option.keyword_length, 'number', '`opts.keyword_length` must be `number`' },
-    keyword_pattern = { params.option.keyword_pattern, 'string', '`opts.keyword_pattern` must be `string`' },
-    get_bufnrs = { params.option.get_bufnrs, 'function', '`opts.get_bufnrs` must be `function`' },
+    keyword_length = { params.option.keyword_length, 'number' },
+    keyword_pattern = { params.option.keyword_pattern, 'string' },
+    get_bufnrs = { params.option.get_bufnrs, 'function' },
+    indexing_chunk_size = { params.option.indexing_chunk_size, 'number' },
+    indexing_interval = { params.option.indexing_interval, 'number' },
  })
+end
+
+source.get_keyword_pattern = function(self, params)
+  self:_validate_options(params)
  return params.option.keyword_pattern
 end

 source.complete = function(self, params, callback)
-  params.option = vim.tbl_deep_extend('keep', params.option, defaults)
-  vim.validate({
-    keyword_pattern = { params.option.keyword_pattern, 'string', '`opts.keyword_pattern` must be `string`' },
-    get_bufnrs = { params.option.get_bufnrs, 'function', '`opts.get_bufnrs` must be `function`' },
-  })
+  self:_validate_options(params)

  local processing = false
-  for _, buf in ipairs(self:_get_buffers(params)) do
+  local bufs = self:_get_buffers(params)
+  for _, buf in ipairs(bufs) do
    processing = processing or buf.processing
  end

-  vim.defer_fn(vim.schedule_wrap(function()
+  vim.defer_fn(function()
    local input = string.sub(params.context.cursor_before_line, params.offset)
    local items = {}
    local words = {}
-    for _, buf in ipairs(self:_get_buffers(params)) do
-      for _, word in ipairs(buf:get_words()) do
+    for _, buf in ipairs(bufs) do
+      for word, _ in pairs(buf:get_words()) do
        if not words[word] and input ~= word then
          words[word] = true
          table.insert(items, {
@ -58,7 +63,7 @@ source.complete = function(self, params, callback)
      items = items,
      isIncomplete = processing,
    })
-  end), processing and 100 or 0)
+  end, processing and 100 or 0)
 end

 --- _get_bufs
@ -69,7 +74,9 @@ source._get_buffers = function(self, params)
      local new_buf = buffer.new(
        bufnr,
        params.option.keyword_length,
-        params.option.keyword_pattern
+        params.option.keyword_pattern,
+        params.option.indexing_chunk_size,
+        params.option.indexing_interval
      )
      new_buf:index()
      new_buf:watch()