Add options for tweaking indexing speed and enabling synchronous mode

2025-05-05 17:50:47 +00:00 · 2021-11-07 18:41:16 +02:00 · 2021-11-07 18:41:16 +02:00 · 3143b0fb9f
commit 3143b0fb9f
parent e26cdfb26f
3 changed files with 109 additions and 24 deletions
--- a/README.md
+++ b/README.md
@ -109,6 +109,58 @@ end
 ```


+### indexing_interval (type: number)
+
+_Default:_ `200`
+
+The interval (in milliseconds) at which buffers are scanned for words when they are first opened.
+Setting this interval to lower values will increase the speed of indexing, but at the expense of
+higher CPU usage. By default indexing happens asynchronously, but setting this option to zero or
+a negative value will switch indexing to a synchronous algorithm, which uses significantly less
+RAM on big files and takes less time in total (to index the entire file), with the obvious
+downside of blocking the user interface for a second or two. On small files (up to tens of
+thousands of lines, probably) the difference will be unnoticeable, though.
+
+
+### indexing_chunk_size (type: number)
+
+_Default:_ `1000`
+
+The number of lines processed in one batch every `indexing_interval` milliseconds. Setting it to
+higher values will make indexing faster, but at the cost of responsiveness of the UI. When using
+the synchronous mode, changing this option may improve memory usage, though the default value has
+been tested to be pretty good in this regard.
+
+Please note that `indexing_interval` and `indexing_chunk_size` are advanced options; change
them only if you experience performance or RAM usage problems (or need to work on particularly
large files), and be sure to measure the results!
+
+
+## Performance on large text files
+
+This source has been tested on code files of a few megabytes in size (5-10) and it has been
optimized for them. However, the indexed words can still take up tens of megabytes of RAM if the
+file is big (on small files it _will not be more_ than a couple of megabytes, typically much
+less). So if you wish to avoid accidentally wasting lots of RAM when editing big files, you can
+tweak `get_bufnrs`, for example like this:
+
+```lua
+get_bufnrs = function()
+  local buf = vim.api.nvim_get_current_buf()
+  local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf))
+  if byte_size > 1024 * 1024 then -- 1 Megabyte max
+    return {}
+  end
+  return { buf }
+end
+```
+
+Of course, this snippet can be combined with any other recipes for `get_bufnrs`.
+
+As another tip, turning on the synchronous indexing mode is very likely to help with reducing
memory usage; see the `indexing_interval` option.
+
+
 ## Locality bonus comparator (distance-based sorting)

 This source also provides a comparator function which uses information from the word indexer
--- a/lua/cmp_buffer/buffer.lua
+++ b/lua/cmp_buffer/buffer.lua
@ -2,8 +2,6 @@
 ---@field public bufnr number
 ---@field public opts cmp_buffer.Options
 ---@field public regex any
---@field public indexing_chunk_size number
---@field public indexing_interval number
 ---@field public timer any|nil
 ---@field public lines_count number
 ---@field public lines_words table<number, string[]>
@ -34,8 +32,6 @@ function buffer.new(bufnr, opts)

  self.opts = opts
  self.regex = vim.regex(self.opts.keyword_pattern)
-  self.indexing_chunk_size = 1000
-  self.indexing_interval = 200

  self.lines_count = 0
  self.lines_words = {}
@ -96,11 +92,18 @@ end
 ---Indexing buffer
 function buffer.index(self)
  self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
-  for i = 1, self.lines_count do
-    self.lines_words[i] = {}
-  end
+  -- NOTE: Pre-allocating self.lines_words here somehow wastes more memory, and
+  -- not doing that doesn't have a visible effect on performance. Win-win.
+  -- for i = 1, self.lines_count do
+  --   self.lines_words[i] = {}
+  -- end

-  self:index_range_async(0, self.lines_count)
+  if self.opts.indexing_interval <= 0 then
+    self:index_range(0, self.lines_count, self.opts.indexing_chunk_size)
+    self:mark_all_lines_dirty()
+  else
+    self:index_range_async(0, self.lines_count, self.opts.indexing_chunk_size)
+  end
 end

 --- Workaround for https://github.com/neovim/neovim/issues/16729
@ -112,30 +115,52 @@ function buffer.safe_buf_call(self, callback)
  end
 end

-function buffer.index_range(self, range_start, range_end)
+--- sync algorithm
+function buffer.index_range(self, range_start, range_end, chunk_size)
  self:safe_buf_call(function()
-    local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
-    for i, line in ipairs(lines) do
-      self:index_line(range_start + i, line)
+    if chunk_size < 1 then
+      chunk_size = range_end - range_start
+    end
+    local chunk_start = range_start
+    while chunk_start < range_end do
+      local chunk_end = math.min(chunk_start + chunk_size, range_end)
+      -- For some reason requesting line arrays multiple times in chunks leads
+      -- to much better memory usage than doing that in one big array, which is
+      -- why the sync algorithm has better memory usage than the async one.
+      local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true)
+      for i, line in ipairs(chunk_lines) do
+        self:index_line(chunk_start + i, line)
+      end
+      chunk_start = chunk_end
    end
  end)
 end

-function buffer.index_range_async(self, range_start, range_end)
+--- async algorithm
+function buffer.index_range_async(self, range_start, range_end, chunk_size)
+  if chunk_size < 1 then
+    chunk_size = range_end - range_start
+  end
  local chunk_start = range_start

  local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
+  -- This flag prevents vim.schedule() callbacks from piling up in the queue
+  -- when the indexing interval is very short.
+  local scheduled = false

  self.timer = vim.loop.new_timer()
-  self.timer:start(
-    0,
-    self.indexing_interval,
-    vim.schedule_wrap(function()
+  self.timer:start(0, self.opts.indexing_interval, function()
+    if scheduled then
+      return
+    end
+    scheduled = true
+    vim.schedule(function()
+      scheduled = false
      if self.closed then
        return
      end

-      local chunk_end = math.min(chunk_start + self.indexing_chunk_size, range_end)
+      local chunk_end = math.min(chunk_start + chunk_size, range_end)
      self:safe_buf_call(function()
        for linenr = chunk_start + 1, chunk_end do
          self:index_line(linenr, lines[linenr])
@ -149,7 +174,7 @@ function buffer.index_range_async(self, range_start, range_end)
        self:stop_indexing_timer()
      end
    end)
-  )
+  end)
 end

 --- watch
@ -219,7 +244,7 @@ function buffer.watch(self)
      self.lines_count = new_lines_count

      -- replace lines
-      self:index_range(first_line, new_last_line)
+      self:index_range(first_line, new_last_line, self.opts.indexing_chunk_size)

      if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then
        self.unique_words_curr_line_dirty = true
@ -242,9 +267,11 @@ function buffer.watch(self)
      -- because tables of all lines can be assumed to be fresh.
      local new_lines_count = vim.api.nvim_buf_line_count(self.bufnr)
      if new_lines_count > self.lines_count then -- append
-        for i = self.lines_count + 1, new_lines_count do
-          self.lines_words[i] = {}
-        end
+        -- Again, no need to pre-allocate, index_line will append new lines
+        -- itself.
+        -- for i = self.lines_count + 1, new_lines_count do
+        --   self.lines_words[i] = {}
+        -- end
      elseif new_lines_count < self.lines_count then -- remove
        for i = self.lines_count, new_lines_count + 1, -1 do
          self.lines_words[i] = nil
@ -252,7 +279,7 @@ function buffer.watch(self)
      end
      self.lines_count = new_lines_count

-      self:index_range(0, self.lines_count)
+      self:index_range(0, self.lines_count, self.opts.indexing_chunk_size)
      self:mark_all_lines_dirty()
      self.words_distances_dirty = true
    end,
--- a/lua/cmp_buffer/source.lua
+++ b/lua/cmp_buffer/source.lua
@ -4,6 +4,8 @@ local buffer = require('cmp_buffer.buffer')
 ---@field public keyword_length number
 ---@field public keyword_pattern string
 ---@field public get_bufnrs fun(): number[]
+---@field public indexing_chunk_size number
+---@field public indexing_interval number

 ---@type cmp_buffer.Options
 local defaults = {
@ -12,6 +14,8 @@ local defaults = {
  get_bufnrs = function()
    return { vim.api.nvim_get_current_buf() }
  end,
+  indexing_chunk_size = 1000,
+  indexing_interval = 200,
 }

 local source = {}
@ -29,6 +33,8 @@ source._validate_options = function(_, params)
    keyword_length = { opts.keyword_length, 'number' },
    keyword_pattern = { opts.keyword_pattern, 'string' },
    get_bufnrs = { opts.get_bufnrs, 'function' },
+    indexing_chunk_size = { opts.indexing_chunk_size, 'number' },
+    indexing_interval = { opts.indexing_interval, 'number' },
  })
  return opts
 end