Add options for tweaking indexing speed and enabling synchronous mode

Dmytro Meleshko 2021-11-07 18:41:16 +02:00
parent e26cdfb26f
commit 3143b0fb9f
3 changed files with 109 additions and 24 deletions


@@ -109,6 +109,58 @@ end
 ```
+
+### indexing_interval (type: number)
+
+_Default:_ `200`
+
+The interval (in milliseconds) at which buffers are scanned for words when they are first opened.
+Setting this interval to lower values will speed up indexing, at the expense of higher CPU usage.
+By default indexing happens asynchronously, but setting this option to zero or a negative value
+switches indexing to a synchronous algorithm, which uses significantly less RAM on big files and
+takes less time in total (to index the entire file), with the obvious downside of blocking the
+user interface for a second or two. On small files (up to tens of thousands of lines, probably)
+the difference will be unnoticeable, though.
+
+### indexing_chunk_size (type: number)
+
+_Default:_ `1000`
+
+The number of lines processed per batch every `indexing_interval` milliseconds. Setting it to
+higher values will make indexing faster, but at the cost of UI responsiveness. When using the
+synchronous mode, changing this option may improve memory usage, though the default value has
+been tested to be pretty good in this regard.
+
+Please note that `indexing_interval` and `indexing_chunk_size` are advanced options: change them
+only if you experience performance or RAM usage problems (or need to work on particularly large
+files), and be sure to measure the results!
+
+## Performance on large text files
+
+This source has been tested on code files a few megabytes in size (5-10 MB) and has been
+optimized for them; however, the indexed words can still take up tens of megabytes of RAM if the
+file is big (on small files it _will not be more_ than a couple of megabytes, typically much
+less). So if you wish to avoid accidentally wasting lots of RAM when editing big files, you can
+tweak `get_bufnrs`, for example like this:
+
+```lua
+get_bufnrs = function()
+  local buf = vim.api.nvim_get_current_buf()
+  local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf))
+  if byte_size > 1024 * 1024 then -- 1 Megabyte max
+    return {}
+  end
+  return { buf }
+end
+```
+
+Of course, this snippet can be combined with any other recipes for `get_bufnrs`.
+
+As another tip, turning on the synchronous indexing mode is very likely to help reduce memory
+usage; see the `indexing_interval` option.
+
 ## Locality bonus comparator (distance-based sorting)
 This source also provides a comparator function which uses information from the word indexer
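The options documented above are passed to the source through nvim-cmp's per-source `option` table. A minimal sketch of such a configuration (the surrounding `cmp.setup` shape and the example values are illustrative, not part of this commit):

```lua
-- Illustrative values only; tune them per the guidance in the README text above.
require('cmp').setup({
  sources = {
    {
      name = 'buffer',
      option = {
        indexing_interval = 100, -- scan buffers more often (higher CPU usage)
        indexing_chunk_size = 2000, -- index more lines per batch
        -- indexing_interval = 0, -- or: switch to the synchronous (blocking) algorithm
      },
    },
  },
})
```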
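The README also notes that the size guard can be combined with other `get_bufnrs` recipes. One possible combination with an "all loaded buffers" recipe (an assumption for illustration, not shown in this commit) could look like:

```lua
get_bufnrs = function()
  local bufs = {}
  for _, buf in ipairs(vim.api.nvim_list_bufs()) do
    -- Only index buffers that are loaded and smaller than 1 MiB.
    if vim.api.nvim_buf_is_loaded(buf) then
      local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf))
      if byte_size <= 1024 * 1024 then
        bufs[#bufs + 1] = buf
      end
    end
  end
  return bufs
end
```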


@@ -2,8 +2,6 @@
 ---@field public bufnr number
 ---@field public opts cmp_buffer.Options
 ---@field public regex any
----@field public indexing_chunk_size number
----@field public indexing_interval number
 ---@field public timer any|nil
 ---@field public lines_count number
 ---@field public lines_words table<number, string[]>
@@ -34,8 +32,6 @@ function buffer.new(bufnr, opts)
   self.opts = opts
   self.regex = vim.regex(self.opts.keyword_pattern)
-  self.indexing_chunk_size = 1000
-  self.indexing_interval = 200
   self.lines_count = 0
   self.lines_words = {}
@@ -96,11 +92,18 @@ end
 ---Indexing buffer
 function buffer.index(self)
   self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
-  for i = 1, self.lines_count do
-    self.lines_words[i] = {}
-  end
-  self:index_range_async(0, self.lines_count)
+  -- NOTE: Pre-allocating self.lines_words here somehow wastes more memory, and
+  -- not doing that doesn't have a visible effect on performance. Win-win.
+  -- for i = 1, self.lines_count do
+  --   self.lines_words[i] = {}
+  -- end
+  if self.opts.indexing_interval <= 0 then
+    self:index_range(0, self.lines_count, self.opts.indexing_chunk_size)
+    self:mark_all_lines_dirty()
+  else
+    self:index_range_async(0, self.lines_count, self.opts.indexing_chunk_size)
+  end
 end

 --- Workaround for https://github.com/neovim/neovim/issues/16729
@@ -112,30 +115,52 @@ function buffer.safe_buf_call(self, callback)
   end
 end

-function buffer.index_range(self, range_start, range_end)
+--- sync algorithm
+function buffer.index_range(self, range_start, range_end, chunk_size)
   self:safe_buf_call(function()
-    local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
-    for i, line in ipairs(lines) do
-      self:index_line(range_start + i, line)
+    if chunk_size < 1 then
+      chunk_size = range_end - range_start
+    end
+    local chunk_start = range_start
+    while chunk_start < range_end do
+      local chunk_end = math.min(chunk_start + chunk_size, range_end)
+      -- For some reason requesting line arrays multiple times in chunks leads
+      -- to much better memory usage than doing that in one big array, which is
+      -- why the sync algorithm has better memory usage than the async one.
+      local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true)
+      for i, line in ipairs(chunk_lines) do
+        self:index_line(chunk_start + i, line)
+      end
+      chunk_start = chunk_end
     end
   end)
 end

-function buffer.index_range_async(self, range_start, range_end)
+--- async algorithm
+function buffer.index_range_async(self, range_start, range_end, chunk_size)
+  if chunk_size < 1 then
+    chunk_size = range_end - range_start
+  end
   local chunk_start = range_start
   local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
+  -- This flag prevents vim.schedule() callbacks from piling up in the queue
+  -- when the indexing interval is very short.
+  local scheduled = false
   self.timer = vim.loop.new_timer()
-  self.timer:start(
-    0,
-    self.indexing_interval,
-    vim.schedule_wrap(function()
+  self.timer:start(0, self.opts.indexing_interval, function()
+    if scheduled then
+      return
+    end
+    scheduled = true
+    vim.schedule(function()
+      scheduled = false
       if self.closed then
         return
       end
-      local chunk_end = math.min(chunk_start + self.indexing_chunk_size, range_end)
+      local chunk_end = math.min(chunk_start + chunk_size, range_end)
       self:safe_buf_call(function()
         for linenr = chunk_start + 1, chunk_end do
           self:index_line(linenr, lines[linenr])
@@ -149,7 +174,7 @@ function buffer.index_range_async(self, range_start, range_end)
         self:stop_indexing_timer()
       end
     end)
-  )
+  end)
 end

 --- watch
@@ -219,7 +244,7 @@ function buffer.watch(self)
       self.lines_count = new_lines_count

       -- replace lines
-      self:index_range(first_line, new_last_line)
+      self:index_range(first_line, new_last_line, self.opts.indexing_chunk_size)

       if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then
         self.unique_words_curr_line_dirty = true
@@ -242,9 +267,11 @@ function buffer.watch(self)
       -- because tables of all lines can be assumed to be fresh.
       local new_lines_count = vim.api.nvim_buf_line_count(self.bufnr)
       if new_lines_count > self.lines_count then -- append
-        for i = self.lines_count + 1, new_lines_count do
-          self.lines_words[i] = {}
-        end
+        -- Again, no need to pre-allocate, index_line will append new lines
+        -- itself.
+        -- for i = self.lines_count + 1, new_lines_count do
+        --   self.lines_words[i] = {}
+        -- end
       elseif new_lines_count < self.lines_count then -- remove
         for i = self.lines_count, new_lines_count + 1, -1 do
           self.lines_words[i] = nil
@@ -252,7 +279,7 @@ function buffer.watch(self)
       end
       self.lines_count = new_lines_count
-      self:index_range(0, self.lines_count)
+      self:index_range(0, self.lines_count, self.opts.indexing_chunk_size)
       self:mark_all_lines_dirty()
       self.words_distances_dirty = true
     end,
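The `scheduled` flag added to `index_range_async` above exists because the libuv timer callback runs outside the API-safe context and has to defer its work through `vim.schedule()`; without the flag, a very short `indexing_interval` could queue more callbacks than the editor can drain. A stripped-down sketch of the same pattern, standalone and with illustrative names (not code from this commit):

```lua
-- Repeating libuv timer that defers API-touching work with vim.schedule(),
-- guarded by a flag so callbacks cannot pile up when the interval is short.
local timer = vim.loop.new_timer()
local scheduled = false
timer:start(0, 200, function()
  if scheduled then
    return -- the previous chunk has not been processed yet
  end
  scheduled = true
  vim.schedule(function()
    scheduled = false
    -- process one chunk of work here (it is safe to call vim.api.* now);
    -- once all chunks are done, stop and close the timer:
    -- timer:stop(); timer:close()
  end)
end)
```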


@@ -4,6 +4,8 @@ local buffer = require('cmp_buffer.buffer')
 ---@field public keyword_length number
 ---@field public keyword_pattern string
 ---@field public get_bufnrs fun(): number[]
+---@field public indexing_chunk_size number
+---@field public indexing_interval number

 ---@type cmp_buffer.Options
 local defaults = {
@@ -12,6 +14,8 @@ local defaults = {
   get_bufnrs = function()
     return { vim.api.nvim_get_current_buf() }
   end,
+  indexing_chunk_size = 1000,
+  indexing_interval = 200,
 }

 local source = {}
@@ -29,6 +33,8 @@ source._validate_options = function(_, params)
     keyword_length = { opts.keyword_length, 'number' },
     keyword_pattern = { opts.keyword_pattern, 'string' },
     get_bufnrs = { opts.get_bufnrs, 'function' },
+    indexing_chunk_size = { opts.indexing_chunk_size, 'number' },
+    indexing_interval = { opts.indexing_interval, 'number' },
   })
   return opts
 end
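With the new entries in `_validate_options`, a misconfigured option is rejected by `vim.validate`, which raises an error naming the offending field. A quick sketch of that behaviour with sample values (not taken from this commit):

```lua
-- vim.validate raises when a value does not match the declared type,
-- e.g. passing a string where a number is expected.
local opts = { indexing_chunk_size = '1000', indexing_interval = 200 }
local ok, err = pcall(vim.validate, {
  indexing_chunk_size = { opts.indexing_chunk_size, 'number' },
  indexing_interval = { opts.indexing_interval, 'number' },
})
print(ok, err) -- false, plus an error message naming the field and the expected type
```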