From 3143b0fb9f57b6634538d39781e134b01796e00d Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Sun, 7 Nov 2021 18:41:16 +0200 Subject: [PATCH 1/8] Add options for tweaking indexing speed and enabling synchronous mode --- README.md | 52 +++++++++++++++++++++++++++ lua/cmp_buffer/buffer.lua | 75 ++++++++++++++++++++++++++------------- lua/cmp_buffer/source.lua | 6 ++++ 3 files changed, 109 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 52d87e9..26025fc 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,58 @@ end ``` +### indexing_interval (type: number) + +_Default:_ `200` + +The rate (in milliseconds) at which buffers are scanned for words when they are first opened. +Setting this interval to lower values will increase the speed of indexing, but at the expense of +higher CPU usage. By default indexing happens asynchronously, but setting this option to zero or +a negative value will switch indexing to a synchronous algorithm, which uses significantly less +RAM on big files and takes less time in total (to index the entire file), with the obvious +downside of blocking the user interface for a second or two. On small files (up to tens of +thousands of lines, probably) the difference will be unnoticeable, though. + + +### indexing_chunk_size (type: number) + +_Default:_ `1000` + +The number of lines processed in batch every `indexing_interval` milliseconds. Setting it to +higher values will make indexing faster, but at the cost of responsiveness of the UI. When using +the synchronous mode, changing this option may improve memory usage, though the default value has +been tested to be pretty good in this regard. + +Please note that the `indexing_interval` and `indexing_chunk_size` are advanced options, change +them only if you experience performance or RAM usage problems (or need to work on particularly +large files) and be sure to measure the results! + + +## Performance on large text files + +This source has been tested on code files of a few megabytes in size (5-10) and it has been +optimized for them, however, the indexed words can still take up tens of megabytes of RAM if the +file is big (on small files it _will not be more_ than a couple of megabytes, typically much +less). So if you wish to avoid accidentally wasting lots of RAM when editing big files, you can +tweak `get_bufnrs`, for example like this: + +```lua +get_bufnrs = function() + local buf = vim.api.nvim_get_current_buf() + local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf)) + if byte_size > 1024 * 1024 then -- 1 Megabyte max + return {} + end + return { buf } +end +``` + +Of course, this snippet can be combined with any other recipes for `get_bufnrs`. + +As another tip, turning on the synchronous indexing mode is very likely to help with reducing +memory usage, see the `indexing_interval` option. 
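+
+For example, a minimal sketch of enabling the synchronous mode (the `cmp.setup` wrapper is the
+one shown earlier in this README; only the option value matters here):
+
+```lua
+cmp.setup({
+  sources = {
+    {
+      name = 'buffer',
+      option = {
+        -- Zero or a negative value switches to the synchronous indexing algorithm.
+        indexing_interval = 0,
+      },
+    },
+  },
+})
+```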
+ + ## Locality bonus comparator (distance-based sorting) This source also provides a comparator function which uses information from the word indexer diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 4a44e32..777a89b 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -2,8 +2,6 @@ ---@field public bufnr number ---@field public opts cmp_buffer.Options ---@field public regex any ----@field public indexing_chunk_size number ----@field public indexing_interval number ---@field public timer any|nil ---@field public lines_count number ---@field public lines_words table @@ -34,8 +32,6 @@ function buffer.new(bufnr, opts) self.opts = opts self.regex = vim.regex(self.opts.keyword_pattern) - self.indexing_chunk_size = 1000 - self.indexing_interval = 200 self.lines_count = 0 self.lines_words = {} @@ -96,11 +92,18 @@ end ---Indexing buffer function buffer.index(self) self.lines_count = vim.api.nvim_buf_line_count(self.bufnr) - for i = 1, self.lines_count do - self.lines_words[i] = {} - end + -- NOTE: Pre-allocating self.lines_words here somehow wastes more memory, and + -- not doing that doesn't have a visible effect on performance. Win-win. + -- for i = 1, self.lines_count do + -- self.lines_words[i] = {} + -- end - self:index_range_async(0, self.lines_count) + if self.opts.indexing_interval <= 0 then + self:index_range(0, self.lines_count, self.opts.indexing_chunk_size) + self:mark_all_lines_dirty() + else + self:index_range_async(0, self.lines_count, self.opts.indexing_chunk_size) + end end --- Workaround for https://github.com/neovim/neovim/issues/16729 @@ -112,30 +115,52 @@ function buffer.safe_buf_call(self, callback) end end -function buffer.index_range(self, range_start, range_end) +--- sync algorithm +function buffer.index_range(self, range_start, range_end, chunk_size) self:safe_buf_call(function() - local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true) - for i, line in ipairs(lines) do - self:index_line(range_start + i, line) + if chunk_size < 1 then + chunk_size = range_end - range_start + end + local chunk_start = range_start + while chunk_start < range_end do + local chunk_end = math.min(chunk_start + chunk_size, range_end) + -- For some reason requesting line arrays multiple times in chunks leads + -- to much better memory usage than doing that in one big array, which is + -- why the sync algorithm has better memory usage than the async one. + local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true) + for i, line in ipairs(chunk_lines) do + self:index_line(chunk_start + i, line) + end + chunk_start = chunk_end end end) end -function buffer.index_range_async(self, range_start, range_end) +--- async algorithm +function buffer.index_range_async(self, range_start, range_end, chunk_size) + if chunk_size < 1 then + chunk_size = range_end - range_start + end local chunk_start = range_start local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true) + -- This flag prevents vim.schedule() callbacks from piling up in the queue + -- when the indexing interval is very short. 
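+  -- (With a very short interval, libuv can fire the timer again before the
+  -- previously scheduled callback has run on the main loop, so several
+  -- callbacks could otherwise pile up and all run on a single event loop tick.)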
+ local scheduled = false self.timer = vim.loop.new_timer() - self.timer:start( - 0, - self.indexing_interval, - vim.schedule_wrap(function() + self.timer:start(0, self.opts.indexing_interval, function() + if scheduled then + return + end + scheduled = true + vim.schedule(function() + scheduled = false if self.closed then return end - local chunk_end = math.min(chunk_start + self.indexing_chunk_size, range_end) + local chunk_end = math.min(chunk_start + chunk_size, range_end) self:safe_buf_call(function() for linenr = chunk_start + 1, chunk_end do self:index_line(linenr, lines[linenr]) @@ -149,7 +174,7 @@ function buffer.index_range_async(self, range_start, range_end) self:stop_indexing_timer() end end) - ) + end) end --- watch @@ -219,7 +244,7 @@ function buffer.watch(self) self.lines_count = new_lines_count -- replace lines - self:index_range(first_line, new_last_line) + self:index_range(first_line, new_last_line, self.opts.indexing_chunk_size) if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then self.unique_words_curr_line_dirty = true @@ -242,9 +267,11 @@ function buffer.watch(self) -- because tables of all lines can be assumed to be fresh. local new_lines_count = vim.api.nvim_buf_line_count(self.bufnr) if new_lines_count > self.lines_count then -- append - for i = self.lines_count + 1, new_lines_count do - self.lines_words[i] = {} - end + -- Again, no need to pre-allocate, index_line will append new lines + -- itself. + -- for i = self.lines_count + 1, new_lines_count do + -- self.lines_words[i] = {} + -- end elseif new_lines_count < self.lines_count then -- remove for i = self.lines_count, new_lines_count + 1, -1 do self.lines_words[i] = nil @@ -252,7 +279,7 @@ function buffer.watch(self) end self.lines_count = new_lines_count - self:index_range(0, self.lines_count) + self:index_range(0, self.lines_count, self.opts.indexing_chunk_size) self:mark_all_lines_dirty() self.words_distances_dirty = true end, diff --git a/lua/cmp_buffer/source.lua b/lua/cmp_buffer/source.lua index f9cdc30..2bb7806 100644 --- a/lua/cmp_buffer/source.lua +++ b/lua/cmp_buffer/source.lua @@ -4,6 +4,8 @@ local buffer = require('cmp_buffer.buffer') ---@field public keyword_length number ---@field public keyword_pattern string ---@field public get_bufnrs fun(): number[] +---@field public indexing_chunk_size number +---@field public indexing_interval number ---@type cmp_buffer.Options local defaults = { @@ -12,6 +14,8 @@ local defaults = { get_bufnrs = function() return { vim.api.nvim_get_current_buf() } end, + indexing_chunk_size = 1000, + indexing_interval = 200, } local source = {} @@ -29,6 +33,8 @@ source._validate_options = function(_, params) keyword_length = { opts.keyword_length, 'number' }, keyword_pattern = { opts.keyword_pattern, 'string' }, get_bufnrs = { opts.get_bufnrs, 'function' }, + indexing_chunk_size = { opts.indexing_chunk_size, 'number' }, + indexing_interval = { opts.indexing_interval, 'number' }, }) return opts end From 6c7b786cb4844eb71c724dab51b1deddd573666c Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Sun, 14 Nov 2021 20:49:09 +0200 Subject: [PATCH 2/8] Make the async indexer resistant to user editing the buffer --- lua/cmp_buffer/buffer.lua | 69 +++++++++++++++++++++------------------ lua/cmp_buffer/source.lua | 2 +- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 777a89b..7ec338d 100644 --- a/lua/cmp_buffer/buffer.lua 
+++ b/lua/cmp_buffer/buffer.lua @@ -18,6 +18,10 @@ ---@field public words_distances_dirty boolean local buffer = {} +-- For some reason requesting this much lines multiple times in chunks leads to +-- much better memory usage than fetching the entire file in one go. +buffer.GET_LINES_CHUNK_SIZE = 1000 + ---Create new buffer object ---@param bufnr number ---@param opts cmp_buffer.Options @@ -98,11 +102,11 @@ function buffer.index(self) -- self.lines_words[i] = {} -- end - if self.opts.indexing_interval <= 0 then - self:index_range(0, self.lines_count, self.opts.indexing_chunk_size) + if self.opts.indexing_interval < 1 then + self:index_range(0, self.lines_count) self:mark_all_lines_dirty() else - self:index_range_async(0, self.lines_count, self.opts.indexing_chunk_size) + self:index_buffer_async() end end @@ -116,20 +120,17 @@ function buffer.safe_buf_call(self, callback) end --- sync algorithm -function buffer.index_range(self, range_start, range_end, chunk_size) +function buffer.index_range(self, range_start, range_end, skip_already_indexed) self:safe_buf_call(function() - if chunk_size < 1 then - chunk_size = range_end - range_start - end + local chunk_size = self.GET_LINES_CHUNK_SIZE local chunk_start = range_start while chunk_start < range_end do local chunk_end = math.min(chunk_start + chunk_size, range_end) - -- For some reason requesting line arrays multiple times in chunks leads - -- to much better memory usage than doing that in one big array, which is - -- why the sync algorithm has better memory usage than the async one. local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true) for i, line in ipairs(chunk_lines) do - self:index_line(chunk_start + i, line) + if not skip_already_indexed or not self.lines_words[chunk_start + i] then + self:index_line(chunk_start + i, line) + end end chunk_start = chunk_end end @@ -137,19 +138,17 @@ function buffer.index_range(self, range_start, range_end, chunk_size) end --- async algorithm -function buffer.index_range_async(self, range_start, range_end, chunk_size) - if chunk_size < 1 then - chunk_size = range_end - range_start - end - local chunk_start = range_start +function buffer.index_buffer_async(self) + local chunk_start = 0 - local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true) -- This flag prevents vim.schedule() callbacks from piling up in the queue -- when the indexing interval is very short. local scheduled = false - self.timer = vim.loop.new_timer() - self.timer:start(0, self.opts.indexing_interval, function() + -- Negative values result in an integer overflow in luv (vim.loop), and zero + -- disables timer repeat, so only intervals larger than 1 are valid. + local interval = math.max(1, self.opts.indexing_interval) + self.timer:start(0, interval, function() if scheduled then return end @@ -160,19 +159,27 @@ function buffer.index_range_async(self, range_start, range_end, chunk_size) return end - local chunk_end = math.min(chunk_start + chunk_size, range_end) - self:safe_buf_call(function() - for linenr = chunk_start + 1, chunk_end do - self:index_line(linenr, lines[linenr]) - end - end) + -- Note that the async indexer is designed to not break even if the user + -- is editing the file while it is in the process of being indexed. 
+ -- Because the indexing in watcher must use the synchronous algorithm, we + -- assume that the data already present in self.lines_words to be correct + -- and doesn't need refreshing here because even if we do receive text + -- from nvim_buf_get_lines different from what the watcher has seen, it + -- will catch up on the next on_lines event. + + local line_count = vim.api.nvim_buf_line_count(self.bufnr) + -- Skip over the already indexed lines + while chunk_start < line_count and self.lines_words[chunk_start + 1] do + chunk_start = chunk_start + 1 + end + local chunk_end = math.min(chunk_start + self.opts.indexing_chunk_size, line_count) + if chunk_end >= line_count then + self:stop_indexing_timer() + end + self:index_range(chunk_start, chunk_end, true) chunk_start = chunk_end self:mark_all_lines_dirty() self.words_distances_dirty = true - - if chunk_end >= range_end then - self:stop_indexing_timer() - end end) end) end @@ -244,7 +251,7 @@ function buffer.watch(self) self.lines_count = new_lines_count -- replace lines - self:index_range(first_line, new_last_line, self.opts.indexing_chunk_size) + self:index_range(first_line, new_last_line) if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then self.unique_words_curr_line_dirty = true @@ -279,7 +286,7 @@ function buffer.watch(self) end self.lines_count = new_lines_count - self:index_range(0, self.lines_count, self.opts.indexing_chunk_size) + self:index_range(0, self.lines_count) self:mark_all_lines_dirty() self.words_distances_dirty = true end, diff --git a/lua/cmp_buffer/source.lua b/lua/cmp_buffer/source.lua index 2bb7806..46c0593 100644 --- a/lua/cmp_buffer/source.lua +++ b/lua/cmp_buffer/source.lua @@ -15,7 +15,7 @@ local defaults = { return { vim.api.nvim_get_current_buf() } end, indexing_chunk_size = 1000, - indexing_interval = 200, + indexing_interval = 100, } local source = {} From a3ab9bec602dc310f1fb862da070d7419f84a6bd Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Sun, 19 Dec 2021 21:36:11 +0200 Subject: [PATCH 3/8] improve reliability of indexing while editing, make on_reload async --- lua/cmp_buffer/buffer.lua | 143 ++++++++++++++++++-------------------- lua/cmp_buffer/source.lua | 4 +- lua/cmp_buffer/timer.lua | 48 +++++++++++++ 3 files changed, 116 insertions(+), 79 deletions(-) create mode 100644 lua/cmp_buffer/timer.lua diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 7ec338d..8aacd61 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -1,9 +1,12 @@ +local timer = require('cmp_buffer.timer') + ---@class cmp_buffer.Buffer ---@field public bufnr number ---@field public opts cmp_buffer.Options ---@field public regex any ----@field public timer any|nil +---@field public timer cmp_buffer.Timer ---@field public lines_count number +---@field public timer_current_line number ---@field public lines_words table ---@field public unique_words_curr_line table ---@field public unique_words_other_lines table @@ -30,7 +33,7 @@ function buffer.new(bufnr, opts) local self = setmetatable({}, { __index = buffer }) self.bufnr = bufnr - self.timer = nil + self.timer = timer.new() self.closed = false self.on_close_cb = nil @@ -38,6 +41,7 @@ function buffer.new(bufnr, opts) self.regex = vim.regex(self.opts.keyword_pattern) self.lines_count = 0 + self.timer_current_line = -1 self.lines_words = {} self.unique_words_curr_line = {} @@ -58,8 +62,11 @@ end function buffer.close(self) self.closed = true 
self:stop_indexing_timer() + self.timer:close() + self.timer = nil self.lines_count = 0 + self.timer_current_line = -1 self.lines_words = {} self.unique_words_curr_line = {} @@ -79,11 +86,8 @@ function buffer.close(self) end function buffer.stop_indexing_timer(self) - if self.timer and not self.timer:is_closing() then - self.timer:stop() - self.timer:close() - end - self.timer = nil + self.timer:stop() + self.timer_current_line = -1 end function buffer.mark_all_lines_dirty(self) @@ -91,23 +95,7 @@ function buffer.mark_all_lines_dirty(self) self.unique_words_other_lines_dirty = true self.last_edit_first_line = 0 self.last_edit_last_line = 0 -end - ----Indexing buffer -function buffer.index(self) - self.lines_count = vim.api.nvim_buf_line_count(self.bufnr) - -- NOTE: Pre-allocating self.lines_words here somehow wastes more memory, and - -- not doing that doesn't have a visible effect on performance. Win-win. - -- for i = 1, self.lines_count do - -- self.lines_words[i] = {} - -- end - - if self.opts.indexing_interval < 1 then - self:index_range(0, self.lines_count) - self:mark_all_lines_dirty() - else - self:index_buffer_async() - end + self.words_distances_dirty = true end --- Workaround for https://github.com/neovim/neovim/issues/16729 @@ -119,7 +107,6 @@ function buffer.safe_buf_call(self, callback) end end ---- sync algorithm function buffer.index_range(self, range_start, range_end, skip_already_indexed) self:safe_buf_call(function() local chunk_size = self.GET_LINES_CHUNK_SIZE @@ -137,55 +124,50 @@ function buffer.index_range(self, range_start, range_end, skip_already_indexed) end) end ---- async algorithm -function buffer.index_buffer_async(self) - local chunk_start = 0 +function buffer.start_indexing_timer(self) + self.lines_count = vim.api.nvim_buf_line_count(self.bufnr) + self.timer_current_line = 0 - -- This flag prevents vim.schedule() callbacks from piling up in the queue - -- when the indexing interval is very short. - local scheduled = false - self.timer = vim.loop.new_timer() -- Negative values result in an integer overflow in luv (vim.loop), and zero -- disables timer repeat, so only intervals larger than 1 are valid. local interval = math.max(1, self.opts.indexing_interval) self.timer:start(0, interval, function() - if scheduled then + if self.closed then + self:stop_indexing_timer() return end - scheduled = true - vim.schedule(function() - scheduled = false - if self.closed then - return - end - -- Note that the async indexer is designed to not break even if the user - -- is editing the file while it is in the process of being indexed. - -- Because the indexing in watcher must use the synchronous algorithm, we - -- assume that the data already present in self.lines_words to be correct - -- and doesn't need refreshing here because even if we do receive text - -- from nvim_buf_get_lines different from what the watcher has seen, it - -- will catch up on the next on_lines event. + -- Note that the async indexer is designed to not break even if the user is + -- editing the file while it is in the process of being indexed. Because + -- the indexing in watcher must use the synchronous algorithm, we assume + -- that the data already present in self.lines_words to be correct and + -- doesn't need refreshing here because even if we do receive text from + -- nvim_buf_get_lines different from what the watcher has seen so far, it + -- (the watcher) will catch up on the next on_lines event. 
- local line_count = vim.api.nvim_buf_line_count(self.bufnr) - -- Skip over the already indexed lines - while chunk_start < line_count and self.lines_words[chunk_start + 1] do - chunk_start = chunk_start + 1 - end - local chunk_end = math.min(chunk_start + self.opts.indexing_chunk_size, line_count) - if chunk_end >= line_count then - self:stop_indexing_timer() - end - self:index_range(chunk_start, chunk_end, true) - chunk_start = chunk_end - self:mark_all_lines_dirty() - self.words_distances_dirty = true - end) + -- Skip over the already indexed lines + while self.lines_words[self.timer_current_line + 1] do + self.timer_current_line = self.timer_current_line + 1 + end + + local chunk_start = self.timer_current_line + local chunk_size = self.opts.indexing_chunk_size + -- NOTE: self.lines_count may be modified by the indexer. + local chunk_end = chunk_size >= 1 and math.min(chunk_start + chunk_size, self.lines_count) or self.lines_count + if chunk_end >= self.lines_count then + self:stop_indexing_timer() + end + + self:index_range(chunk_start, chunk_end, true) + self.timer_current_line = chunk_end + self:mark_all_lines_dirty() end) end --- watch function buffer.watch(self) + self.lines_count = vim.api.nvim_buf_line_count(self.bufnr) + -- NOTE: As far as I know, indexing in watching can't be done asynchronously -- because even built-in commands generate multiple consequent `on_lines` -- events, and I'm not even mentioning plugins here. To get accurate results @@ -250,6 +232,25 @@ function buffer.watch(self) end self.lines_count = new_lines_count + -- This branch is support code for handling cases when the user is + -- editing the buffer while the async indexer is running. It solves the + -- problem that if new lines are inserted or old lines are deleted, the + -- indexes of each subsequent line will change, and so the indexer + -- current position must be adjusted to not accidentally skip any lines. + if self.timer:is_active() then + if first_line <= self.timer_current_line and self.timer_current_line < old_last_line then + -- The indexer was in the area of the current text edit. We will + -- synchronously index this area it in a moment, so the indexer + -- should resume from right after the edit range. + self.timer_current_line = new_last_line + elseif self.timer_current_line >= old_last_line then + -- The indexer was somewhere past the current text edit. This means + -- that the line numbers could have changed, and the indexing + -- position must be adjusted accordingly. + self.timer_current_line = self.timer_current_line + delta + end + end + -- replace lines self:index_range(first_line, new_last_line) @@ -270,25 +271,13 @@ function buffer.watch(self) return true end - -- The logic for adjusting lines list on buffer reloads is much simpler - -- because tables of all lines can be assumed to be fresh. - local new_lines_count = vim.api.nvim_buf_line_count(self.bufnr) - if new_lines_count > self.lines_count then -- append - -- Again, no need to pre-allocate, index_line will append new lines - -- itself. 
- -- for i = self.lines_count + 1, new_lines_count do - -- self.lines_words[i] = {} - -- end - elseif new_lines_count < self.lines_count then -- remove - for i = self.lines_count, new_lines_count + 1, -1 do - self.lines_words[i] = nil - end + -- clear all lines + for i = self.lines_count, 1, -1 do + self.lines_words[i] = nil end - self.lines_count = new_lines_count - self:index_range(0, self.lines_count) - self:mark_all_lines_dirty() - self.words_distances_dirty = true + self:stop_indexing_timer() + self:start_indexing_timer() end, on_detach = function(_, _) diff --git a/lua/cmp_buffer/source.lua b/lua/cmp_buffer/source.lua index 46c0593..ed1447d 100644 --- a/lua/cmp_buffer/source.lua +++ b/lua/cmp_buffer/source.lua @@ -50,7 +50,7 @@ source.complete = function(self, params, callback) local processing = false local bufs = self:_get_buffers(opts) for _, buf in ipairs(bufs) do - if buf.timer then + if buf.timer:is_active() then processing = true break end @@ -90,7 +90,7 @@ source._get_buffers = function(self, opts) new_buf.on_close_cb = function() self.buffers[bufnr] = nil end - new_buf:index() + new_buf:start_indexing_timer() new_buf:watch() self.buffers[bufnr] = new_buf end diff --git a/lua/cmp_buffer/timer.lua b/lua/cmp_buffer/timer.lua new file mode 100644 index 0000000..2d0b708 --- /dev/null +++ b/lua/cmp_buffer/timer.lua @@ -0,0 +1,48 @@ +---@class cmp_buffer.Timer +---@field public handle any +---@field private callback_wrapper_instance fun()|nil +local timer = {} + +function timer.new() + local self = setmetatable({}, { __index = timer }) + self.handle = vim.loop.new_timer() + self.callback_wrapper_instance = nil + return self +end + +---@param timeout_ms number +---@param repeat_ms number +---@param callback fun() +function timer:start(timeout_ms, repeat_ms, callback) + local scheduled = false + local function callback_wrapper() + if scheduled then + return + end + scheduled = true + vim.schedule(function() + scheduled = false + if self.callback_wrapper_instance ~= callback_wrapper then + return + end + callback() + end) + end + self.handle:start(timeout_ms, repeat_ms, callback_wrapper) + self.callback_wrapper_instance = callback_wrapper +end + +function timer:stop() + self.handle:stop() + self.callback_wrapper_instance = nil +end + +function timer:is_active() + return self.handle:is_active() +end + +function timer:close() + self.handle:close() +end + +return timer From c8daddb987ba2e295be3eae48c91aeaba6a92170 Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Mon, 20 Dec 2021 12:37:28 +0200 Subject: [PATCH 4/8] rename indexing_chunk_size to indexing_batch_size --- lua/cmp_buffer/buffer.lua | 12 ++++++------ lua/cmp_buffer/source.lua | 7 ++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 8aacd61..26444a7 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -150,16 +150,16 @@ function buffer.start_indexing_timer(self) self.timer_current_line = self.timer_current_line + 1 end - local chunk_start = self.timer_current_line - local chunk_size = self.opts.indexing_chunk_size + local batch_start = self.timer_current_line + local batch_size = self.opts.indexing_batch_size -- NOTE: self.lines_count may be modified by the indexer. 
- local chunk_end = chunk_size >= 1 and math.min(chunk_start + chunk_size, self.lines_count) or self.lines_count - if chunk_end >= self.lines_count then + local batch_end = batch_size >= 1 and math.min(batch_start + batch_size, self.lines_count) or self.lines_count + if batch_end >= self.lines_count then self:stop_indexing_timer() end - self:index_range(chunk_start, chunk_end, true) - self.timer_current_line = chunk_end + self:index_range(batch_start, batch_end, true) + self.timer_current_line = batch_end self:mark_all_lines_dirty() end) end diff --git a/lua/cmp_buffer/source.lua b/lua/cmp_buffer/source.lua index ed1447d..df0ddbe 100644 --- a/lua/cmp_buffer/source.lua +++ b/lua/cmp_buffer/source.lua @@ -4,7 +4,7 @@ local buffer = require('cmp_buffer.buffer') ---@field public keyword_length number ---@field public keyword_pattern string ---@field public get_bufnrs fun(): number[] ----@field public indexing_chunk_size number +---@field public indexing_batch_size number ---@field public indexing_interval number ---@type cmp_buffer.Options @@ -14,7 +14,7 @@ local defaults = { get_bufnrs = function() return { vim.api.nvim_get_current_buf() } end, - indexing_chunk_size = 1000, + indexing_batch_size = 1000, indexing_interval = 100, } @@ -33,7 +33,7 @@ source._validate_options = function(_, params) keyword_length = { opts.keyword_length, 'number' }, keyword_pattern = { opts.keyword_pattern, 'string' }, get_bufnrs = { opts.get_bufnrs, 'function' }, - indexing_chunk_size = { opts.indexing_chunk_size, 'number' }, + indexing_batch_size = { opts.indexing_batch_size, 'number' }, indexing_interval = { opts.indexing_interval, 'number' }, }) return opts @@ -82,6 +82,7 @@ source.complete = function(self, params, callback) end ---@param opts cmp_buffer.Options +---@return cmp_buffer.Buffer[] source._get_buffers = function(self, opts) local buffers = {} for _, bufnr in ipairs(opts.get_bufnrs()) do From 0226d443a5332b1de12b2802883115b9c7f6b491 Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Mon, 20 Dec 2021 12:37:41 +0200 Subject: [PATCH 5/8] completely rewrite the documentation for the new options --- README.md | 97 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 54 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 26025fc..a08c1c9 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ The below source configuration are available. To set any of these options, do: ```lua cmp.setup({ sources = { - { + { name = 'buffer', option = { -- Options go into this table @@ -111,54 +111,16 @@ end ### indexing_interval (type: number) -_Default:_ `200` +_Default:_ `100` -The rate (in milliseconds) at which buffers are scanned for words when they are first opened. -Setting this interval to lower values will increase the speed of indexing, but at the expense of -higher CPU usage. By default indexing happens asynchronously, but setting this option to zero or -a negative value will switch indexing to a synchronous algorithm, which uses significantly less -RAM on big files and takes less time in total (to index the entire file), with the obvious -downside of blocking the user interface for a second or two. On small files (up to tens of -thousands of lines, probably) the difference will be unnoticeable, though. +Advanced option. See the section [Indexing](#indexing). -### indexing_chunk_size (type: number) +### indexing_batch_size (type: number) _Default:_ `1000` -The number of lines processed in batch every `indexing_interval` milliseconds. 
Setting it to -higher values will make indexing faster, but at the cost of responsiveness of the UI. When using -the synchronous mode, changing this option may improve memory usage, though the default value has -been tested to be pretty good in this regard. - -Please note that the `indexing_interval` and `indexing_chunk_size` are advanced options, change -them only if you experience performance or RAM usage problems (or need to work on particularly -large files) and be sure to measure the results! - - -## Performance on large text files - -This source has been tested on code files of a few megabytes in size (5-10) and it has been -optimized for them, however, the indexed words can still take up tens of megabytes of RAM if the -file is big (on small files it _will not be more_ than a couple of megabytes, typically much -less). So if you wish to avoid accidentally wasting lots of RAM when editing big files, you can -tweak `get_bufnrs`, for example like this: - -```lua -get_bufnrs = function() - local buf = vim.api.nvim_get_current_buf() - local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf)) - if byte_size > 1024 * 1024 then -- 1 Megabyte max - return {} - end - return { buf } -end -``` - -Of course, this snippet can be combined with any other recipes for `get_bufnrs`. - -As another tip, turning on the synchronous indexing mode is very likely to help with reducing -memory usage, see the `indexing_interval` option. +Advanced option. See the section [Indexing](#indexing). ## Locality bonus comparator (distance-based sorting) @@ -185,3 +147,52 @@ cmp.setup({ } }) ``` + + +## Indexing + +When a buffer is opened, this source first has to scan all lines in the buffer, match all words +and store all of their occurrences. This process is called _indexing_. When actually editing the +text in the buffer, the index of words is kept up-to-date with changes to the buffer's contents, +this is called _watching_. It is done by re-running the indexer on just the changed lines. +Indexing happens completely asynchronously in background, unlike watching, which must be performed +synchronously to ensure that the index of words is kept perfectly in-sync with the lines in the +buffer. However, most of the time this will not be a problem since many typical text edit +operations affect only one or two lines, unless you are pasting a 1000-line snippet. + +_Note that you can freely edit the buffer while it is being indexed_, the underlying algorithm is +written in such a way that your changes will not break the index or cause errors. If a crash does +happen - it is a bug, so please report it. + +The speed of indexing is configurable with two options: `indexing_interval` and +`indexing_batch_size`. Essentially, when indexing, a timer is started, which pulls a batch of +`indexing_batch_size` lines from the buffer, scans them for words, and repeats after +`indexing_interval` milliseconds. Decreasing interval and/or increasing the batch size will make +the indexer faster, but at the expense of higher CPU usage and more lag when editing the file +while indexing is still in progress. Setting `indexing_batch_size` to a negative value will switch +the indexer to the "synchronous" mode: this will process all lines in one go, take less time in +total (since no other code will be running on the Lua thread), but with the obvious downside that +the editor UI will be blocked. 
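+
+For illustration, the two knobs could be set like this (the numbers below are examples only,
+not recommendations; `indexing_batch_size = -1` shows the synchronous mode described above):
+
+```lua
+cmp.setup({
+  sources = {
+    {
+      name = 'buffer',
+      option = {
+        -- Index more lines per tick, more often (faster, but heavier on the CPU):
+        indexing_interval = 50,
+        indexing_batch_size = 2000,
+        -- Or uncomment this to index the whole buffer in one go, blocking the UI:
+        -- indexing_batch_size = -1,
+      },
+    },
+  },
+})
+```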
+ +### Performance on large text files + +This source has been tested on code files of a few megabytes in size (5-10) and contains +optimizations for them, however, the indexed words can still take up tens of megabytes of RAM if +the file is large. It also currently has troubles on files with very long lines, see issue +[#13](https://github.com/hrsh7th/cmp-buffer/issues/13). + +So, if you wish to avoid accidentally running this source on big files, you can tweak +`get_bufnrs`, for example like this: + +```lua +get_bufnrs = function() + local buf = vim.api.nvim_get_current_buf() + local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf)) + if byte_size > 1024 * 1024 then -- 1 Megabyte max + return {} + end + return { buf } +end +``` + +Of course, this snippet can be combined with any other recipes for `get_bufnrs`. From 70713e1ca477cabbb1d974929add56f3e0556eff Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Mon, 20 Dec 2021 13:52:53 +0200 Subject: [PATCH 6/8] add comments to the new Timer class --- lua/cmp_buffer/timer.lua | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lua/cmp_buffer/timer.lua b/lua/cmp_buffer/timer.lua index 2d0b708..3ed83c7 100644 --- a/lua/cmp_buffer/timer.lua +++ b/lua/cmp_buffer/timer.lua @@ -1,3 +1,18 @@ +---This timer matches the semantics of setInterval and clearInterval of +---Javascript. It provides a more reliable alternative to vim.loop.timer_start +---with a callback wrapped into a vim.schedule call by addressing two problems: +---1. Scheduled callbacks are invoked less frequently than a libuv timer with a +--- small interval (1-5ms). This causes those callbacks to fill up the queue +--- in the event loop, and so the callback function may get invoked multiple +--- times on one event loop tick. In contrast, Javascript's setInterval +--- guarantees that the callback is not invoked more frequently than the +--- interval. +---2. When a libuv timer is stopped with vim.loop.timer_stop, it doesn't affect +--- the callbacks that have already been scheduled. So timer_stop will not +--- immediately stop the timer, the actual callback function will run one +--- more time until it is finally stopped. This implementation ensures that +--- timer_stop prevents any subsequent invocations of the callback. +--- ---@class cmp_buffer.Timer ---@field public handle any ---@field private callback_wrapper_instance fun()|nil @@ -14,7 +29,11 @@ end ---@param repeat_ms number ---@param callback fun() function timer:start(timeout_ms, repeat_ms, callback) + -- This is the flag that fixes problem 1. local scheduled = false + -- Creating a function on every call to timer_start ensures that we can always + -- detect when a different callback is set by calling timer_start and prevent + -- the old one from being invoked. local function callback_wrapper() if scheduled then return @@ -22,6 +41,7 @@ function timer:start(timeout_ms, repeat_ms, callback) scheduled = true vim.schedule(function() scheduled = false + -- Either a different callback was set, or the timer has been stopped. 
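+      -- (timer:stop() clears callback_wrapper_instance, so a callback that was
+      -- scheduled before the stop call bails out here; this is what addresses
+      -- problem 2 described at the top of this file.)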
if self.callback_wrapper_instance ~= callback_wrapper then return end From 077d7de49a9d6244d67bd6a1e08206cda9f7aa0c Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Fri, 24 Dec 2021 14:49:15 +0200 Subject: [PATCH 7/8] use clear_table for clearing lines list in on_reload --- lua/cmp_buffer/buffer.lua | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 26444a7..fd94436 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -1,5 +1,11 @@ local timer = require('cmp_buffer.timer') +local function clear_table(tbl) + for k in pairs(tbl) do + tbl[k] = nil + end +end + ---@class cmp_buffer.Buffer ---@field public bufnr number ---@field public opts cmp_buffer.Options @@ -271,10 +277,7 @@ function buffer.watch(self) return true end - -- clear all lines - for i = self.lines_count, 1, -1 do - self.lines_words[i] = nil - end + clear_table(self.lines_words) self:stop_indexing_timer() self:start_indexing_timer() @@ -289,12 +292,6 @@ function buffer.watch(self) }) end -local function clear_table(tbl) - for k in pairs(tbl) do - tbl[k] = nil - end -end - ---@param linenr number ---@param line string function buffer.index_line(self, linenr, line) From eba65f6fdabf294fd598a8ff688d342ff330d5ea Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Fri, 24 Dec 2021 14:53:02 +0200 Subject: [PATCH 8/8] place index_range calls as the last statement where it is called --- lua/cmp_buffer/buffer.lua | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index fd94436..b530e64 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -163,10 +163,10 @@ function buffer.start_indexing_timer(self) if batch_end >= self.lines_count then self:stop_indexing_timer() end - - self:index_range(batch_start, batch_end, true) self.timer_current_line = batch_end self:mark_all_lines_dirty() + + self:index_range(batch_start, batch_end, true) end) end @@ -257,9 +257,6 @@ function buffer.watch(self) end end - -- replace lines - self:index_range(first_line, new_last_line) - if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then self.unique_words_curr_line_dirty = true else @@ -270,6 +267,9 @@ function buffer.watch(self) self.last_edit_last_line = new_last_line self.words_distances_dirty = true + + -- replace lines + self:index_range(first_line, new_last_line) end, on_reload = function(_, _)