Merge pull request #23 from dmitmel/synchronous-indexing

Improve reliability of async indexing while the user is editing the file, implement the memory usage optimization for the indexer, make its speed configurable
This commit is contained in:
hrsh7th 2021-12-24 23:30:44 +09:00 committed by GitHub
commit a01cfeca70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 235 additions and 77 deletions

View File

@ -19,7 +19,7 @@ The below source configuration are available. To set any of these options, do:
```lua
cmp.setup({
sources = {
{
{
name = 'buffer',
option = {
-- Options go into this table
@ -109,6 +109,20 @@ end
```
### indexing_interval (type: number)
_Default:_ `100`
Advanced option. See the section [Indexing](#indexing).
### indexing_batch_size (type: number)
_Default:_ `1000`
Advanced option. See the section [Indexing](#indexing).
## Locality bonus comparator (distance-based sorting)
This source also provides a comparator function which uses information from the word indexer
@ -133,3 +147,52 @@ cmp.setup({
}
})
```
## Indexing
When a buffer is opened, this source first has to scan all lines in the buffer, match all words
and store all of their occurrences. This process is called _indexing_. When actually editing the
text in the buffer, the index of words is kept up-to-date with changes to the buffer's contents;
this is called _watching_. It is done by re-running the indexer on just the changed lines.
Indexing happens completely asynchronously in the background, unlike watching, which must be performed
synchronously to ensure that the index of words is kept perfectly in-sync with the lines in the
buffer. However, most of the time this will not be a problem since many typical text edit
operations affect only one or two lines, unless you are pasting a 1000-line snippet.
_Note that you can freely edit the buffer while it is being indexed_: the underlying algorithm is
written in such a way that your changes will not break the index or cause errors. If a crash does
happen, it is a bug, so please report it.
The speed of indexing is configurable with two options: `indexing_interval` and
`indexing_batch_size`. Essentially, when indexing, a timer is started, which pulls a batch of
`indexing_batch_size` lines from the buffer, scans them for words, and repeats after
`indexing_interval` milliseconds. Decreasing the interval and/or increasing the batch size will make
the indexer faster, but at the expense of higher CPU usage and more lag when editing the file
while indexing is still in progress. Setting `indexing_batch_size` to a negative value will switch
the indexer to the "synchronous" mode: this will process all lines in one go, take less time in
total (since no other code will be running on the Lua thread), but with the obvious downside that
the editor UI will be blocked.
### Performance on large text files
This source has been tested on code files of a few megabytes in size (5-10) and contains
optimizations for them. However, the indexed words can still take up tens of megabytes of RAM if
the file is large. It also currently has trouble with files that have very long lines, see issue
[#13](https://github.com/hrsh7th/cmp-buffer/issues/13).
So, if you wish to avoid accidentally running this source on big files, you can tweak
`get_bufnrs`, for example like this:
```lua
get_bufnrs = function()
local buf = vim.api.nvim_get_current_buf()
local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf))
if byte_size > 1024 * 1024 then -- 1 Megabyte max
return {}
end
return { buf }
end
```
Of course, this snippet can be combined with any other recipes for `get_bufnrs`.

View File

@ -1,11 +1,18 @@
local timer = require('cmp_buffer.timer')
---Remove every entry from `tbl` in place, leaving the same (now empty) table
---object behind.
---@param tbl table
local function clear_table(tbl)
  local key = next(tbl)
  while key ~= nil do
    -- Look up the successor before erasing the current entry.
    local following = next(tbl, key)
    tbl[key] = nil
    key = following
  end
end
---@class cmp_buffer.Buffer
---@field public bufnr number
---@field public opts cmp_buffer.Options
---@field public regex any
---@field public indexing_chunk_size number
---@field public indexing_interval number
---@field public timer any|nil
---@field public timer cmp_buffer.Timer
---@field public lines_count number
---@field public timer_current_line number
---@field public lines_words table<number, string[]>
---@field public unique_words_curr_line table<string, boolean>
---@field public unique_words_other_lines table<string, boolean>
@ -20,6 +27,10 @@
---@field public words_distances_dirty boolean
local buffer = {}
-- For some reason requesting this many lines multiple times in chunks leads to
-- much better memory usage than fetching the entire file in one go.
buffer.GET_LINES_CHUNK_SIZE = 1000
---Create new buffer object
---@param bufnr number
---@param opts cmp_buffer.Options
@ -28,16 +39,15 @@ function buffer.new(bufnr, opts)
local self = setmetatable({}, { __index = buffer })
self.bufnr = bufnr
self.timer = nil
self.timer = timer.new()
self.closed = false
self.on_close_cb = nil
self.opts = opts
self.regex = vim.regex(self.opts.keyword_pattern)
self.indexing_chunk_size = 1000
self.indexing_interval = 200
self.lines_count = 0
self.timer_current_line = -1
self.lines_words = {}
self.unique_words_curr_line = {}
@ -58,8 +68,11 @@ end
function buffer.close(self)
self.closed = true
self:stop_indexing_timer()
self.timer:close()
self.timer = nil
self.lines_count = 0
self.timer_current_line = -1
self.lines_words = {}
self.unique_words_curr_line = {}
@ -79,11 +92,8 @@ function buffer.close(self)
end
function buffer.stop_indexing_timer(self)
if self.timer and not self.timer:is_closing() then
self.timer:stop()
self.timer:close()
end
self.timer = nil
self.timer:stop()
self.timer_current_line = -1
end
function buffer.mark_all_lines_dirty(self)
@ -91,16 +101,7 @@ function buffer.mark_all_lines_dirty(self)
self.unique_words_other_lines_dirty = true
self.last_edit_first_line = 0
self.last_edit_last_line = 0
end
---Indexing buffer
function buffer.index(self)
self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
for i = 1, self.lines_count do
self.lines_words[i] = {}
end
self:index_range_async(0, self.lines_count)
self.words_distances_dirty = true
end
--- Workaround for https://github.com/neovim/neovim/issues/16729
@ -112,48 +113,67 @@ function buffer.safe_buf_call(self, callback)
end
end
function buffer.index_range(self, range_start, range_end)
function buffer.index_range(self, range_start, range_end, skip_already_indexed)
self:safe_buf_call(function()
local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
for i, line in ipairs(lines) do
self:index_line(range_start + i, line)
local chunk_size = self.GET_LINES_CHUNK_SIZE
local chunk_start = range_start
while chunk_start < range_end do
local chunk_end = math.min(chunk_start + chunk_size, range_end)
local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true)
for i, line in ipairs(chunk_lines) do
if not skip_already_indexed or not self.lines_words[chunk_start + i] then
self:index_line(chunk_start + i, line)
end
end
chunk_start = chunk_end
end
end)
end
function buffer.index_range_async(self, range_start, range_end)
local chunk_start = range_start
function buffer.start_indexing_timer(self)
self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
self.timer_current_line = 0
local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true)
-- Negative values result in an integer overflow in luv (vim.loop), and zero
-- disables timer repeat, so only intervals of at least 1 are valid.
local interval = math.max(1, self.opts.indexing_interval)
self.timer:start(0, interval, function()
if self.closed then
self:stop_indexing_timer()
return
end
self.timer = vim.loop.new_timer()
self.timer:start(
0,
self.indexing_interval,
vim.schedule_wrap(function()
if self.closed then
return
end
-- Note that the async indexer is designed to not break even if the user is
-- editing the file while it is in the process of being indexed. Because
-- the indexing in the watcher must use the synchronous algorithm, we assume
-- that the data already present in self.lines_words is correct and
-- doesn't need refreshing here, because even if we do receive text from
-- nvim_buf_get_lines different from what the watcher has seen so far, it
-- (the watcher) will catch up on the next on_lines event.
local chunk_end = math.min(chunk_start + self.indexing_chunk_size, range_end)
self:safe_buf_call(function()
for linenr = chunk_start + 1, chunk_end do
self:index_line(linenr, lines[linenr])
end
end)
chunk_start = chunk_end
self:mark_all_lines_dirty()
self.words_distances_dirty = true
-- Skip over the already indexed lines
while self.lines_words[self.timer_current_line + 1] do
self.timer_current_line = self.timer_current_line + 1
end
if chunk_end >= range_end then
self:stop_indexing_timer()
end
end)
)
local batch_start = self.timer_current_line
local batch_size = self.opts.indexing_batch_size
-- NOTE: self.lines_count may be modified by the indexer.
local batch_end = batch_size >= 1 and math.min(batch_start + batch_size, self.lines_count) or self.lines_count
if batch_end >= self.lines_count then
self:stop_indexing_timer()
end
self.timer_current_line = batch_end
self:mark_all_lines_dirty()
self:index_range(batch_start, batch_end, true)
end)
end
--- watch
function buffer.watch(self)
self.lines_count = vim.api.nvim_buf_line_count(self.bufnr)
-- NOTE: As far as I know, indexing in watching can't be done asynchronously
-- because even built-in commands generate multiple consecutive `on_lines`
-- events, and I'm not even mentioning plugins here. To get accurate results
@ -218,8 +238,24 @@ function buffer.watch(self)
end
self.lines_count = new_lines_count
-- replace lines
self:index_range(first_line, new_last_line)
-- This branch is support code for handling cases when the user is
-- editing the buffer while the async indexer is running. It solves the
-- problem that if new lines are inserted or old lines are deleted, the
-- indexes of each subsequent line will change, and so the indexer
-- current position must be adjusted to not accidentally skip any lines.
if self.timer:is_active() then
if first_line <= self.timer_current_line and self.timer_current_line < old_last_line then
-- The indexer was in the area of the current text edit. We will
-- synchronously index this area in a moment, so the indexer
-- should resume from right after the edit range.
self.timer_current_line = new_last_line
elseif self.timer_current_line >= old_last_line then
-- The indexer was somewhere past the current text edit. This means
-- that the line numbers could have changed, and the indexing
-- position must be adjusted accordingly.
self.timer_current_line = self.timer_current_line + delta
end
end
if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then
self.unique_words_curr_line_dirty = true
@ -231,6 +267,9 @@ function buffer.watch(self)
self.last_edit_last_line = new_last_line
self.words_distances_dirty = true
-- replace lines
self:index_range(first_line, new_last_line)
end,
on_reload = function(_, _)
@ -238,23 +277,10 @@ function buffer.watch(self)
return true
end
-- The logic for adjusting lines list on buffer reloads is much simpler
-- because tables of all lines can be assumed to be fresh.
local new_lines_count = vim.api.nvim_buf_line_count(self.bufnr)
if new_lines_count > self.lines_count then -- append
for i = self.lines_count + 1, new_lines_count do
self.lines_words[i] = {}
end
elseif new_lines_count < self.lines_count then -- remove
for i = self.lines_count, new_lines_count + 1, -1 do
self.lines_words[i] = nil
end
end
self.lines_count = new_lines_count
clear_table(self.lines_words)
self:index_range(0, self.lines_count)
self:mark_all_lines_dirty()
self.words_distances_dirty = true
self:stop_indexing_timer()
self:start_indexing_timer()
end,
on_detach = function(_, _)
@ -266,12 +292,6 @@ function buffer.watch(self)
})
end
---Remove every entry from `tbl` in place, leaving the same (now empty) table
---object behind.
---@param tbl table
local function clear_table(tbl)
  local key = next(tbl)
  while key ~= nil do
    -- Look up the successor before erasing the current entry.
    local following = next(tbl, key)
    tbl[key] = nil
    key = following
  end
end
---@param linenr number
---@param line string
function buffer.index_line(self, linenr, line)

View File

@ -4,6 +4,8 @@ local buffer = require('cmp_buffer.buffer')
---@field public keyword_length number
---@field public keyword_pattern string
---@field public get_bufnrs fun(): number[]
---@field public indexing_batch_size number
---@field public indexing_interval number
---@type cmp_buffer.Options
local defaults = {
@ -12,6 +14,8 @@ local defaults = {
get_bufnrs = function()
return { vim.api.nvim_get_current_buf() }
end,
indexing_batch_size = 1000,
indexing_interval = 100,
}
local source = {}
@ -29,6 +33,8 @@ source._validate_options = function(_, params)
keyword_length = { opts.keyword_length, 'number' },
keyword_pattern = { opts.keyword_pattern, 'string' },
get_bufnrs = { opts.get_bufnrs, 'function' },
indexing_batch_size = { opts.indexing_batch_size, 'number' },
indexing_interval = { opts.indexing_interval, 'number' },
})
return opts
end
@ -44,7 +50,7 @@ source.complete = function(self, params, callback)
local processing = false
local bufs = self:_get_buffers(opts)
for _, buf in ipairs(bufs) do
if buf.timer then
if buf.timer:is_active() then
processing = true
break
end
@ -76,6 +82,7 @@ source.complete = function(self, params, callback)
end
---@param opts cmp_buffer.Options
---@return cmp_buffer.Buffer[]
source._get_buffers = function(self, opts)
local buffers = {}
for _, bufnr in ipairs(opts.get_bufnrs()) do
@ -84,7 +91,7 @@ source._get_buffers = function(self, opts)
new_buf.on_close_cb = function()
self.buffers[bufnr] = nil
end
new_buf:index()
new_buf:start_indexing_timer()
new_buf:watch()
self.buffers[bufnr] = new_buf
end

68
lua/cmp_buffer/timer.lua Normal file
View File

@ -0,0 +1,68 @@
---This timer matches the semantics of setInterval and clearInterval of
---Javascript. It provides a more reliable alternative to vim.loop.timer_start
---with a callback wrapped into a vim.schedule call by addressing two problems:
---1. Scheduled callbacks are invoked less frequently than a libuv timer with a
--- small interval (1-5ms). This causes those callbacks to fill up the queue
--- in the event loop, and so the callback function may get invoked multiple
--- times on one event loop tick. In contrast, Javascript's setInterval
--- guarantees that the callback is not invoked more frequently than the
--- interval.
---2. When a libuv timer is stopped with vim.loop.timer_stop, it doesn't affect
--- the callbacks that have already been scheduled. So timer_stop will not
--- immediately stop the timer, the actual callback function will run one
--- more time until it is finally stopped. This implementation ensures that
--- timer_stop prevents any subsequent invocations of the callback.
---
---@class cmp_buffer.Timer
---@field public handle any
---@field private callback_wrapper_instance fun()|nil
local timer = {}
---Create a timer object that wraps a fresh handle obtained from
---vim.loop.new_timer(). Nothing is started here; call start() to arm it.
---@return cmp_buffer.Timer
function timer.new()
  local instance = {
    handle = vim.loop.new_timer(),
    -- No callback wrapper is registered until start() is called.
    callback_wrapper_instance = nil,
  }
  return setmetatable(instance, { __index = timer })
end
---Start (or restart) the timer. `timeout_ms` and `repeat_ms` are passed
---unchanged to the underlying handle's start(); `callback` is never run
---directly from the timer tick but is deferred through vim.schedule.
---@param timeout_ms number
---@param repeat_ms number
---@param callback fun()
function timer:start(timeout_ms, repeat_ms, callback)
  -- Guard for problem 1: true while a deferred invocation is still waiting in
  -- the scheduler queue, so that ticks arriving in the meantime are dropped.
  local pending = false
  -- A fresh tick handler is created on every start() call. Its identity is
  -- what lets a deferred invocation find out that the timer was stopped or
  -- restarted with another callback in the meantime.
  local on_tick

  local function run_deferred()
    pending = false
    -- Only the currently registered handler may reach the user callback;
    -- anything else means stop() or another start() has happened since.
    if self.callback_wrapper_instance == on_tick then
      callback()
    end
  end

  on_tick = function()
    if not pending then
      pending = true
      vim.schedule(run_deferred)
    end
  end

  self.handle:start(timeout_ms, repeat_ms, on_tick)
  self.callback_wrapper_instance = on_tick
end
---Stop the underlying handle and forget the registered callback wrapper, so
---that an invocation which has already been queued with vim.schedule returns
---without calling the user callback.
function timer:stop()
  local handle = self.handle
  handle:stop()
  self.callback_wrapper_instance = nil
end
---Report whether the timer is running, as answered by the underlying
---handle's is_active().
function timer:is_active()
  local handle = self.handle
  return handle:is_active()
end
---Release the underlying timer handle. The timer must not be started again
---after this call.
---
---The registered callback wrapper is forgotten first, so an invocation that
---was already queued with vim.schedule will not reach the user callback once
---the handle is gone. The is_closing() check turns a repeated close() into a
---no-op rather than letting luv complain about a handle that is already
---closing.
function timer:close()
  self.callback_wrapper_instance = nil
  local handle = self.handle
  if not handle:is_closing() then
    handle:close()
  end
end
return timer