Merge pull request #23 from dmitmel/synchronous-indexing

Improve reliability of async indexing while the user is editing the file, implement the memory usage optimization for the indexer, make its speed configurable
2025-05-06 02:00:06 +00:00 · 2021-12-24 23:30:44 +09:00 · 2021-12-24 23:30:44 +09:00 · a01cfeca70
commit a01cfeca70
parent e26cdfb26f eba65f6fda
4 changed files with 235 additions and 77 deletions
--- a/README.md
+++ b/README.md
@ -19,7 +19,7 @@ The below source configuration are available. To set any of these options, do:
 ```lua
 cmp.setup({
  sources = {
-    { 
+    {
      name = 'buffer',
      option = {
        -- Options go into this table
@ -109,6 +109,20 @@ end
 ```


+### indexing_interval (type: number)
+
+_Default:_ `100`
+
+Advanced option. See the section [Indexing](#indexing).
+
+
+### indexing_batch_size (type: number)
+
+_Default:_ `1000`
+
+Advanced option. See the section [Indexing](#indexing).
+
+
 ## Locality bonus comparator (distance-based sorting)

 This source also provides a comparator function which uses information from the word indexer
@ -133,3 +147,52 @@ cmp.setup({
  }
 })
 ```
+
+
+## Indexing
+
+When a buffer is opened, this source first has to scan all lines in the buffer, match all words
+and store all of their occurrences. This process is called _indexing_. While you are editing the
+text in the buffer, the index of words is kept up-to-date with changes to the buffer's contents;
+this is called _watching_. It is done by re-running the indexer on just the changed lines.
+Indexing happens completely asynchronously in the background, unlike watching, which must be
+performed synchronously to ensure that the index of words is kept perfectly in sync with the
+lines in the buffer. However, most of the time this will not be a problem since many typical text
+edit operations affect only one or two lines, unless you are pasting a 1000-line snippet.
+
+_Note that you can freely edit the buffer while it is being indexed_: the underlying algorithm is
+written in such a way that your changes will not break the index or cause errors. If a crash does
+happen, it is a bug, so please report it.
+
+The speed of indexing is configurable with two options: `indexing_interval` and
+`indexing_batch_size`. Essentially, when indexing, a timer is started, which pulls a batch of
+`indexing_batch_size` lines from the buffer, scans them for words, and repeats after
+`indexing_interval` milliseconds. Decreasing the interval and/or increasing the batch size will
+make the indexer faster, but at the expense of higher CPU usage and more lag when editing the file
+while indexing is still in progress. Setting `indexing_batch_size` to a value less than 1 (zero or
+negative) will switch the indexer to the "synchronous" mode: this will process all lines in one
+go and take less time in total (since no other code will be running on the Lua thread), but with
+the obvious downside that the editor UI will be blocked.
+
+### Performance on large text files
+
+This source has been tested on code files of a few megabytes in size (5-10) and contains
+optimizations for them. However, the indexed words can still take up tens of megabytes of RAM if
+the file is large. It also currently has trouble with files that have very long lines, see issue
+[#13](https://github.com/hrsh7th/cmp-buffer/issues/13).
+
+So, if you wish to avoid accidentally running this source on big files, you can tweak
+`get_bufnrs`, for example like this:
+
+```lua
+get_bufnrs = function()
+  local buf = vim.api.nvim_get_current_buf()
+  local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf))
+  if byte_size > 1024 * 1024 then -- 1 Megabyte max
+    return {}
+  end
+  return { buf }
+end
+```
+
+Of course, this snippet can be combined with any other recipes for `get_bufnrs`.
--- a/lua/cmp_buffer/buffer.lua
+++ b/lua/cmp_buffer/buffer.lua
@ -1,11 +1,18 @@
+local timer = require('cmp_buffer.timer')
+
+local function clear_table(tbl)
+  for k in pairs(tbl) do
+    tbl[k] = nil
+  end
+end
+
 ---@class cmp_buffer.Buffer
 ---@field public bufnr number
 ---@field public opts cmp_buffer.Options
 ---@field public regex any
---@field public indexing_chunk_size number
---@field public indexing_interval number
---@field public timer any|nil
+---@field public timer cmp_buffer.Timer
 ---@field public lines_count number
+---@field public timer_current_line number
 ---@field public lines_words table<number, string[]>
 ---@field public unique_words_curr_line table<string, boolean>
 ---@field public unique_words_other_lines table<string, boolean>
@ -20,6 +27,10 @@
 ---@field public words_distances_dirty boolean
 local buffer = {}

+-- For some reason requesting this many lines multiple times in chunks leads to
+-- much better memory usage than fetching the entire file in one go.
+buffer.GET_LINES_CHUNK_SIZE = 1000
+
 ---Create new buffer object
 ---@param bufnr number
 ---@param opts cmp_buffer.Options
@ -28,16 +39,15 @@ function buffer.new(bufnr, opts)
  local self = setmetatable({}, { __index = buffer })

  self.bufnr = bufnr
-  self.timer = nil
+  self.timer = timer.new()
  self.closed = false
  self.on_close_cb = nil

  self.opts = opts
  self.regex = vim.regex(self.opts.keyword_pattern)
-  self.indexing_chunk_size = 1000
-  self.indexing_interval = 200

  self.lines_count = 0
+  self.timer_current_line = -1
  self.lines_words = {}

  self.unique_words_curr_line = {}
@ -58,8 +68,11 @@ end
 function buffer.close(self)
  self.closed = true
  self:stop_indexing_timer()
+  self.timer:close()
+  self.timer = nil

  self.lines_count = 0
+  self.timer_current_line = -1
  self.lines_words = {}

  self.unique_words_curr_line = {}
@ -79,11 +92,8 @@ function buffer.close(self)
 end

 function buffer.stop_indexing_timer(self)
-  if self.timer and not self.timer:is_closing() then
-    self.timer:stop()
-    self.timer:close()
-  end
-  self.timer = nil
+  self.timer:stop()
+  self.timer_current_line = -1
 end

 function buffer.mark_all_lines_dirty(self)
@ -91,16 +101,7 @@ function buffer.mark_all_lines_dirty(self)
  self.unique_words_other_lines_dirty = true
  self.last_edit_first_line = 0
  self.last_edit_last_line = 0
-end
-
---Indexing buffer
-function buffer.index(self)
-  self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
-  for i = 1, self.lines_count do
-    self.lines_words[i] = {}
-  end
-
-  self:index_range_async(0, self.lines_count)
+  self.words_distances_dirty = true
 end

 --- Workaround for https://github.com/neovim/neovim/issues/16729
@ -112,48 +113,67 @@ function buffer.safe_buf_call(self, callback)
  end
 end

-function buffer.index_range(self, range_start, range_end)
+function buffer.index_range(self, range_start, range_end, skip_already_indexed)
  self:safe_buf_call(function()
-    local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
-    for i, line in ipairs(lines) do
-      self:index_line(range_start + i, line)
+    local chunk_size = self.GET_LINES_CHUNK_SIZE
+    local chunk_start = range_start
+    while chunk_start < range_end do
+      local chunk_end = math.min(chunk_start + chunk_size, range_end)
+      local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true)
+      for i, line in ipairs(chunk_lines) do
+        if not skip_already_indexed or not self.lines_words[chunk_start + i] then
+          self:index_line(chunk_start + i, line)
+        end
+      end
+      chunk_start = chunk_end
    end
  end)
 end

-function buffer.index_range_async(self, range_start, range_end)
-  local chunk_start = range_start
+function buffer.start_indexing_timer(self)
+  self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
+  self.timer_current_line = 0

-  local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
+  -- Negative values result in an integer overflow in luv (vim.loop), and zero
+  -- disables timer repeat, so only intervals of 1 or larger are valid.
+  local interval = math.max(1, self.opts.indexing_interval)
+  self.timer:start(0, interval, function()
+    if self.closed then
+      self:stop_indexing_timer()
+      return
+    end

-  self.timer = vim.loop.new_timer()
-  self.timer:start(
-    0,
-    self.indexing_interval,
-    vim.schedule_wrap(function()
-      if self.closed then
-        return
-      end
+    -- Note that the async indexer is designed to not break even if the user is
+    -- editing the file while it is in the process of being indexed. Because
+    -- the indexing in watcher must use the synchronous algorithm, we assume
+    -- that the data already present in self.lines_words is correct and
+    -- doesn't need refreshing here, because even if we do receive text from
+    -- nvim_buf_get_lines different from what the watcher has seen so far, it
+    -- (the watcher) will catch up on the next on_lines event.

-      local chunk_end = math.min(chunk_start + self.indexing_chunk_size, range_end)
-      self:safe_buf_call(function()
-        for linenr = chunk_start + 1, chunk_end do
-          self:index_line(linenr, lines[linenr])
-        end
-      end)
-      chunk_start = chunk_end
-      self:mark_all_lines_dirty()
-      self.words_distances_dirty = true
+    -- Skip over the already indexed lines
+    while self.lines_words[self.timer_current_line + 1] do
+      self.timer_current_line = self.timer_current_line + 1
+    end

-      if chunk_end >= range_end then
-        self:stop_indexing_timer()
-      end
-    end)
-  )
+    local batch_start = self.timer_current_line
+    local batch_size = self.opts.indexing_batch_size
+    -- NOTE: self.lines_count may be modified by the indexer.
+    local batch_end = batch_size >= 1 and math.min(batch_start + batch_size, self.lines_count) or self.lines_count
+    if batch_end >= self.lines_count then
+      self:stop_indexing_timer()
+    end
+    self.timer_current_line = batch_end
+    self:mark_all_lines_dirty()
+
+    self:index_range(batch_start, batch_end, true)
+  end)
 end

 --- watch
 function buffer.watch(self)
+  self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
+
  -- NOTE: As far as I know, indexing in watching can't be done asynchronously
  -- because even built-in commands generate multiple consequent `on_lines`
  -- events, and I'm not even mentioning plugins here. To get accurate results
@ -218,8 +238,24 @@ function buffer.watch(self)
      end
      self.lines_count = new_lines_count

-      -- replace lines
-      self:index_range(first_line, new_last_line)
+      -- This branch is support code for handling cases when the user is
+      -- editing the buffer while the async indexer is running. It solves the
+      -- problem that if new lines are inserted or old lines are deleted, the
+      -- indexes of each subsequent line will change, and so the indexer
+      -- current position must be adjusted to not accidentally skip any lines.
+      if self.timer:is_active() then
+        if first_line <= self.timer_current_line and self.timer_current_line < old_last_line then
+          -- The indexer was in the area of the current text edit. We will
+          -- synchronously index this area in a moment, so the indexer
+          -- should resume from right after the edit range.
+          self.timer_current_line = new_last_line
+        elseif self.timer_current_line >= old_last_line then
+          -- The indexer was somewhere past the current text edit. This means
+          -- that the line numbers could have changed, and the indexing
+          -- position must be adjusted accordingly.
+          self.timer_current_line = self.timer_current_line + delta
+        end
+      end

      if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then
        self.unique_words_curr_line_dirty = true
@ -231,6 +267,9 @@ function buffer.watch(self)
      self.last_edit_last_line = new_last_line

      self.words_distances_dirty = true
+
+      -- replace lines
+      self:index_range(first_line, new_last_line)
    end,

    on_reload = function(_, _)
@ -238,23 +277,10 @@ function buffer.watch(self)
        return true
      end

-      -- The logic for adjusting lines list on buffer reloads is much simpler
-      -- because tables of all lines can be assumed to be fresh.
-      local new_lines_count = vim.api.nvim_buf_line_count(self.bufnr)
-      if new_lines_count > self.lines_count then -- append
-        for i = self.lines_count + 1, new_lines_count do
-          self.lines_words[i] = {}
-        end
-      elseif new_lines_count < self.lines_count then -- remove
-        for i = self.lines_count, new_lines_count + 1, -1 do
-          self.lines_words[i] = nil
-        end
-      end
-      self.lines_count = new_lines_count
+      clear_table(self.lines_words)

-      self:index_range(0, self.lines_count)
-      self:mark_all_lines_dirty()
-      self.words_distances_dirty = true
+      self:stop_indexing_timer()
+      self:start_indexing_timer()
    end,

    on_detach = function(_, _)
@ -266,12 +292,6 @@ function buffer.watch(self)
  })
 end

-local function clear_table(tbl)
-  for k in pairs(tbl) do
-    tbl[k] = nil
-  end
-end
-
 ---@param linenr number
 ---@param line string
 function buffer.index_line(self, linenr, line)
--- a/lua/cmp_buffer/source.lua
+++ b/lua/cmp_buffer/source.lua
@ -4,6 +4,8 @@ local buffer = require('cmp_buffer.buffer')
 ---@field public keyword_length number
 ---@field public keyword_pattern string
 ---@field public get_bufnrs fun(): number[]
+---@field public indexing_batch_size number
+---@field public indexing_interval number

 ---@type cmp_buffer.Options
 local defaults = {
@ -12,6 +14,8 @@ local defaults = {
  get_bufnrs = function()
    return { vim.api.nvim_get_current_buf() }
  end,
+  indexing_batch_size = 1000,
+  indexing_interval = 100,
 }

 local source = {}
@ -29,6 +33,8 @@ source._validate_options = function(_, params)
    keyword_length = { opts.keyword_length, 'number' },
    keyword_pattern = { opts.keyword_pattern, 'string' },
    get_bufnrs = { opts.get_bufnrs, 'function' },
+    indexing_batch_size = { opts.indexing_batch_size, 'number' },
+    indexing_interval = { opts.indexing_interval, 'number' },
  })
  return opts
 end
@ -44,7 +50,7 @@ source.complete = function(self, params, callback)
  local processing = false
  local bufs = self:_get_buffers(opts)
  for _, buf in ipairs(bufs) do
-    if buf.timer then
+    if buf.timer:is_active() then
      processing = true
      break
    end
@ -76,6 +82,7 @@ source.complete = function(self, params, callback)
 end

 ---@param opts cmp_buffer.Options
+---@return cmp_buffer.Buffer[]
 source._get_buffers = function(self, opts)
  local buffers = {}
  for _, bufnr in ipairs(opts.get_bufnrs()) do
@ -84,7 +91,7 @@ source._get_buffers = function(self, opts)
      new_buf.on_close_cb = function()
        self.buffers[bufnr] = nil
      end
-      new_buf:index()
+      new_buf:start_indexing_timer()
      new_buf:watch()
      self.buffers[bufnr] = new_buf
    end
--- a/lua/cmp_buffer/timer.lua
+++ b/lua/cmp_buffer/timer.lua
@ -0,0 +1,68 @@
+---This timer matches the semantics of setInterval and clearInterval of
+---JavaScript. It provides a more reliable alternative to vim.loop.timer_start
+---with a callback wrapped into a vim.schedule call by addressing two problems:
+---1. Scheduled callbacks are invoked less frequently than a libuv timer with a
+---   small interval (1-5ms). This causes those callbacks to fill up the queue
+---   in the event loop, and so the callback function may get invoked multiple
+---   times on one event loop tick. In contrast, JavaScript's setInterval
+---   guarantees that the callback is not invoked more frequently than the
+---   interval.
+---2. When a libuv timer is stopped with vim.loop.timer_stop, it doesn't affect
+---   the callbacks that have already been scheduled. So timer_stop will not
+---   immediately stop the timer: the actual callback function will run one
+---   more time before it is finally stopped. This implementation ensures that
+---   timer_stop prevents any subsequent invocations of the callback.
+---
+---@class cmp_buffer.Timer
+---@field public handle any
+---@field private callback_wrapper_instance fun()|nil
+local timer = {}
+
+function timer.new()
+  local self = setmetatable({}, { __index = timer })
+  self.handle = vim.loop.new_timer()
+  self.callback_wrapper_instance = nil
+  return self
+end
+
+---@param timeout_ms number
+---@param repeat_ms number
+---@param callback fun()
+function timer:start(timeout_ms, repeat_ms, callback)
+  -- This is the flag that fixes problem 1.
+  local scheduled = false
+  -- Creating a function on every call to timer_start ensures that we can always
+  -- detect when a different callback is set by calling timer_start and prevent
+  -- the old one from being invoked.
+  local function callback_wrapper()
+    if scheduled then
+      return
+    end
+    scheduled = true
+    vim.schedule(function()
+      scheduled = false
+      -- Either a different callback was set, or the timer has been stopped.
+      if self.callback_wrapper_instance ~= callback_wrapper then
+        return
+      end
+      callback()
+    end)
+  end
+  self.handle:start(timeout_ms, repeat_ms, callback_wrapper)
+  self.callback_wrapper_instance = callback_wrapper
+end
+
+function timer:stop()
+  self.handle:stop()
+  self.callback_wrapper_instance = nil
+end
+
+function timer:is_active()
+  return self.handle:is_active()
+end
+
+function timer:close()
+  self.handle:close()
+end
+
+return timer