From 3143b0fb9f57b6634538d39781e134b01796e00d Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Sun, 7 Nov 2021 18:41:16 +0200 Subject: [PATCH 1/8] Add options for tweaking indexing speed and enabling synchronous mode --- README.md | 52 +++++++++++++++++++++++++++ lua/cmp_buffer/buffer.lua | 75 ++++++++++++++++++++++++++------------- lua/cmp_buffer/source.lua | 6 ++++ 3 files changed, 109 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 52d87e9..26025fc 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,58 @@ end ``` +### indexing_interval (type: number) + +_Default:_ `200` + +The rate (in milliseconds) at which buffers are scanned for words when they are first opened. +Setting this interval to lower values will increase the speed of indexing, but at the expense of +higher CPU usage. By default indexing happens asynchronously, but setting this option to zero or +a negative value will switch indexing to a synchronous algorithm, which uses significantly less +RAM on big files and takes less time in total (to index the entire file), with the obvious +downside of blocking the user interface for a second or two. On small files (up to tens of +thousands of lines, probably) the difference will be unnoticeable, though. + + +### indexing_chunk_size (type: number) + +_Default:_ `1000` + +The number of lines processed in batch every `indexing_interval` milliseconds. Setting it to +higher values will make indexing faster, but at the cost of responsiveness of the UI. When using +the synchronous mode, changing this option may improve memory usage, though the default value has +been tested to be pretty good in this regard. + +Please note that the `indexing_interval` and `indexing_chunk_size` are advanced options, change +them only if you experience performance or RAM usage problems (or need to work on particularly +large files) and be sure to measure the results! + + +## Performance on large text files + +This source has been tested on code files of a few megabytes in size (5-10) and it has been +optimized for them, however, the indexed words can still take up tens of megabytes of RAM if the +file is big (on small files it _will not be more_ than a couple of megabytes, typically much +less). So if you wish to avoid accidentally wasting lots of RAM when editing big files, you can +tweak `get_bufnrs`, for example like this: + +```lua +get_bufnrs = function() + local buf = vim.api.nvim_get_current_buf() + local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf)) + if byte_size > 1024 * 1024 then -- 1 Megabyte max + return {} + end + return { buf } +end +``` + +Of course, this snippet can be combined with any other recipes for `get_bufnrs`. + +As another tip, turning on the synchronous indexing mode is very likely to help with reducing +memory usage, see the `indexing_interval` option. 
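+
+For example, a minimal sketch of enabling the synchronous mode (the `cmp.setup` wrapper is the
+one shown earlier in this README; only the option value matters here):
+
+```lua
+cmp.setup({
+  sources = {
+    {
+      name = 'buffer',
+      option = {
+        -- Zero or a negative value switches to the synchronous indexing algorithm.
+        indexing_interval = 0,
+      },
+    },
+  },
+})
+```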
+ + ## Locality bonus comparator (distance-based sorting) This source also provides a comparator function which uses information from the word indexer diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 4a44e32..777a89b 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -2,8 +2,6 @@ ---@field public bufnr number ---@field public opts cmp_buffer.Options ---@field public regex any ----@field public indexing_chunk_size number ----@field public indexing_interval number ---@field public timer any|nil ---@field public lines_count number ---@field public lines_words table @@ -34,8 +32,6 @@ function buffer.new(bufnr, opts) self.opts = opts self.regex = vim.regex(self.opts.keyword_pattern) - self.indexing_chunk_size = 1000 - self.indexing_interval = 200 self.lines_count = 0 self.lines_words = {} @@ -96,11 +92,18 @@ end ---Indexing buffer function buffer.index(self) self.lines_count = vim.api.nvim_buf_line_count(self.bufnr) - for i = 1, self.lines_count do - self.lines_words[i] = {} - end + -- NOTE: Pre-allocating self.lines_words here somehow wastes more memory, and + -- not doing that doesn't have a visible effect on performance. Win-win. + -- for i = 1, self.lines_count do + -- self.lines_words[i] = {} + -- end - self:index_range_async(0, self.lines_count) + if self.opts.indexing_interval <= 0 then + self:index_range(0, self.lines_count, self.opts.indexing_chunk_size) + self:mark_all_lines_dirty() + else + self:index_range_async(0, self.lines_count, self.opts.indexing_chunk_size) + end end --- Workaround for https://github.com/neovim/neovim/issues/16729 @@ -112,30 +115,52 @@ function buffer.safe_buf_call(self, callback) end end -function buffer.index_range(self, range_start, range_end) +--- sync algorithm +function buffer.index_range(self, range_start, range_end, chunk_size) self:safe_buf_call(function() - local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true) - for i, line in ipairs(lines) do - self:index_line(range_start + i, line) + if chunk_size < 1 then + chunk_size = range_end - range_start + end + local chunk_start = range_start + while chunk_start < range_end do + local chunk_end = math.min(chunk_start + chunk_size, range_end) + -- For some reason requesting line arrays multiple times in chunks leads + -- to much better memory usage than doing that in one big array, which is + -- why the sync algorithm has better memory usage than the async one. + local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true) + for i, line in ipairs(chunk_lines) do + self:index_line(chunk_start + i, line) + end + chunk_start = chunk_end end end) end -function buffer.index_range_async(self, range_start, range_end) +--- async algorithm +function buffer.index_range_async(self, range_start, range_end, chunk_size) + if chunk_size < 1 then + chunk_size = range_end - range_start + end local chunk_start = range_start local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true) + -- This flag prevents vim.schedule() callbacks from piling up in the queue + -- when the indexing interval is very short. 
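+  -- (With a very short interval, libuv can fire the timer again before the
+  -- previously scheduled callback has run on the main loop, so several
+  -- callbacks could otherwise pile up and all run on a single event loop tick.)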
+ local scheduled = false self.timer = vim.loop.new_timer() - self.timer:start( - 0, - self.indexing_interval, - vim.schedule_wrap(function() + self.timer:start(0, self.opts.indexing_interval, function() + if scheduled then + return + end + scheduled = true + vim.schedule(function() + scheduled = false if self.closed then return end - local chunk_end = math.min(chunk_start + self.indexing_chunk_size, range_end) + local chunk_end = math.min(chunk_start + chunk_size, range_end) self:safe_buf_call(function() for linenr = chunk_start + 1, chunk_end do self:index_line(linenr, lines[linenr]) @@ -149,7 +174,7 @@ function buffer.index_range_async(self, range_start, range_end) self:stop_indexing_timer() end end) - ) + end) end --- watch @@ -219,7 +244,7 @@ function buffer.watch(self) self.lines_count = new_lines_count -- replace lines - self:index_range(first_line, new_last_line) + self:index_range(first_line, new_last_line, self.opts.indexing_chunk_size) if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then self.unique_words_curr_line_dirty = true @@ -242,9 +267,11 @@ function buffer.watch(self) -- because tables of all lines can be assumed to be fresh. local new_lines_count = vim.api.nvim_buf_line_count(self.bufnr) if new_lines_count > self.lines_count then -- append - for i = self.lines_count + 1, new_lines_count do - self.lines_words[i] = {} - end + -- Again, no need to pre-allocate, index_line will append new lines + -- itself. + -- for i = self.lines_count + 1, new_lines_count do + -- self.lines_words[i] = {} + -- end elseif new_lines_count < self.lines_count then -- remove for i = self.lines_count, new_lines_count + 1, -1 do self.lines_words[i] = nil @@ -252,7 +279,7 @@ function buffer.watch(self) end self.lines_count = new_lines_count - self:index_range(0, self.lines_count) + self:index_range(0, self.lines_count, self.opts.indexing_chunk_size) self:mark_all_lines_dirty() self.words_distances_dirty = true end, diff --git a/lua/cmp_buffer/source.lua b/lua/cmp_buffer/source.lua index f9cdc30..2bb7806 100644 --- a/lua/cmp_buffer/source.lua +++ b/lua/cmp_buffer/source.lua @@ -4,6 +4,8 @@ local buffer = require('cmp_buffer.buffer') ---@field public keyword_length number ---@field public keyword_pattern string ---@field public get_bufnrs fun(): number[] +---@field public indexing_chunk_size number +---@field public indexing_interval number ---@type cmp_buffer.Options local defaults = { @@ -12,6 +14,8 @@ local defaults = { get_bufnrs = function() return { vim.api.nvim_get_current_buf() } end, + indexing_chunk_size = 1000, + indexing_interval = 200, } local source = {} @@ -29,6 +33,8 @@ source._validate_options = function(_, params) keyword_length = { opts.keyword_length, 'number' }, keyword_pattern = { opts.keyword_pattern, 'string' }, get_bufnrs = { opts.get_bufnrs, 'function' }, + indexing_chunk_size = { opts.indexing_chunk_size, 'number' }, + indexing_interval = { opts.indexing_interval, 'number' }, }) return opts end From 6c7b786cb4844eb71c724dab51b1deddd573666c Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Sun, 14 Nov 2021 20:49:09 +0200 Subject: [PATCH 2/8] Make the async indexer resistant to user editing the buffer --- lua/cmp_buffer/buffer.lua | 69 +++++++++++++++++++++------------------ lua/cmp_buffer/source.lua | 2 +- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 777a89b..7ec338d 100644 --- a/lua/cmp_buffer/buffer.lua 
+++ b/lua/cmp_buffer/buffer.lua @@ -18,6 +18,10 @@ ---@field public words_distances_dirty boolean local buffer = {} +-- For some reason requesting this much lines multiple times in chunks leads to +-- much better memory usage than fetching the entire file in one go. +buffer.GET_LINES_CHUNK_SIZE = 1000 + ---Create new buffer object ---@param bufnr number ---@param opts cmp_buffer.Options @@ -98,11 +102,11 @@ function buffer.index(self) -- self.lines_words[i] = {} -- end - if self.opts.indexing_interval <= 0 then - self:index_range(0, self.lines_count, self.opts.indexing_chunk_size) + if self.opts.indexing_interval < 1 then + self:index_range(0, self.lines_count) self:mark_all_lines_dirty() else - self:index_range_async(0, self.lines_count, self.opts.indexing_chunk_size) + self:index_buffer_async() end end @@ -116,20 +120,17 @@ function buffer.safe_buf_call(self, callback) end --- sync algorithm -function buffer.index_range(self, range_start, range_end, chunk_size) +function buffer.index_range(self, range_start, range_end, skip_already_indexed) self:safe_buf_call(function() - if chunk_size < 1 then - chunk_size = range_end - range_start - end + local chunk_size = self.GET_LINES_CHUNK_SIZE local chunk_start = range_start while chunk_start < range_end do local chunk_end = math.min(chunk_start + chunk_size, range_end) - -- For some reason requesting line arrays multiple times in chunks leads - -- to much better memory usage than doing that in one big array, which is - -- why the sync algorithm has better memory usage than the async one. local chunk_lines = vim.api.nvim_buf_get_lines(self.bufnr, chunk_start, chunk_end, true) for i, line in ipairs(chunk_lines) do - self:index_line(chunk_start + i, line) + if not skip_already_indexed or not self.lines_words[chunk_start + i] then + self:index_line(chunk_start + i, line) + end end chunk_start = chunk_end end @@ -137,19 +138,17 @@ function buffer.index_range(self, range_start, range_end, chunk_size) end --- async algorithm -function buffer.index_range_async(self, range_start, range_end, chunk_size) - if chunk_size < 1 then - chunk_size = range_end - range_start - end - local chunk_start = range_start +function buffer.index_buffer_async(self) + local chunk_start = 0 - local lines = vim.api.nvim_buf_get_lines(self.bufnr, range_start, range_end, true) -- This flag prevents vim.schedule() callbacks from piling up in the queue -- when the indexing interval is very short. local scheduled = false - self.timer = vim.loop.new_timer() - self.timer:start(0, self.opts.indexing_interval, function() + -- Negative values result in an integer overflow in luv (vim.loop), and zero + -- disables timer repeat, so only intervals larger than 1 are valid. + local interval = math.max(1, self.opts.indexing_interval) + self.timer:start(0, interval, function() if scheduled then return end @@ -160,19 +159,27 @@ function buffer.index_range_async(self, range_start, range_end, chunk_size) return end - local chunk_end = math.min(chunk_start + chunk_size, range_end) - self:safe_buf_call(function() - for linenr = chunk_start + 1, chunk_end do - self:index_line(linenr, lines[linenr]) - end - end) + -- Note that the async indexer is designed to not break even if the user + -- is editing the file while it is in the process of being indexed. 
+ -- Because the indexing in watcher must use the synchronous algorithm, we + -- assume that the data already present in self.lines_words to be correct + -- and doesn't need refreshing here because even if we do receive text + -- from nvim_buf_get_lines different from what the watcher has seen, it + -- will catch up on the next on_lines event. + + local line_count = vim.api.nvim_buf_line_count(self.bufnr) + -- Skip over the already indexed lines + while chunk_start < line_count and self.lines_words[chunk_start + 1] do + chunk_start = chunk_start + 1 + end + local chunk_end = math.min(chunk_start + self.opts.indexing_chunk_size, line_count) + if chunk_end >= line_count then + self:stop_indexing_timer() + end + self:index_range(chunk_start, chunk_end, true) chunk_start = chunk_end self:mark_all_lines_dirty() self.words_distances_dirty = true - - if chunk_end >= range_end then - self:stop_indexing_timer() - end end) end) end @@ -244,7 +251,7 @@ function buffer.watch(self) self.lines_count = new_lines_count -- replace lines - self:index_range(first_line, new_last_line, self.opts.indexing_chunk_size) + self:index_range(first_line, new_last_line) if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then self.unique_words_curr_line_dirty = true @@ -279,7 +286,7 @@ function buffer.watch(self) end self.lines_count = new_lines_count - self:index_range(0, self.lines_count, self.opts.indexing_chunk_size) + self:index_range(0, self.lines_count) self:mark_all_lines_dirty() self.words_distances_dirty = true end, diff --git a/lua/cmp_buffer/source.lua b/lua/cmp_buffer/source.lua index 2bb7806..46c0593 100644 --- a/lua/cmp_buffer/source.lua +++ b/lua/cmp_buffer/source.lua @@ -15,7 +15,7 @@ local defaults = { return { vim.api.nvim_get_current_buf() } end, indexing_chunk_size = 1000, - indexing_interval = 200, + indexing_interval = 100, } local source = {} From a3ab9bec602dc310f1fb862da070d7419f84a6bd Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Sun, 19 Dec 2021 21:36:11 +0200 Subject: [PATCH 3/8] improve reliability of indexing while editing, make on_reload async --- lua/cmp_buffer/buffer.lua | 143 ++++++++++++++++++-------------------- lua/cmp_buffer/source.lua | 4 +- lua/cmp_buffer/timer.lua | 48 +++++++++++++ 3 files changed, 116 insertions(+), 79 deletions(-) create mode 100644 lua/cmp_buffer/timer.lua diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 7ec338d..8aacd61 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -1,9 +1,12 @@ +local timer = require('cmp_buffer.timer') + ---@class cmp_buffer.Buffer ---@field public bufnr number ---@field public opts cmp_buffer.Options ---@field public regex any ----@field public timer any|nil +---@field public timer cmp_buffer.Timer ---@field public lines_count number +---@field public timer_current_line number ---@field public lines_words table ---@field public unique_words_curr_line table ---@field public unique_words_other_lines table @@ -30,7 +33,7 @@ function buffer.new(bufnr, opts) local self = setmetatable({}, { __index = buffer }) self.bufnr = bufnr - self.timer = nil + self.timer = timer.new() self.closed = false self.on_close_cb = nil @@ -38,6 +41,7 @@ function buffer.new(bufnr, opts) self.regex = vim.regex(self.opts.keyword_pattern) self.lines_count = 0 + self.timer_current_line = -1 self.lines_words = {} self.unique_words_curr_line = {} @@ -58,8 +62,11 @@ end function buffer.close(self) self.closed = true 
self:stop_indexing_timer() + self.timer:close() + self.timer = nil self.lines_count = 0 + self.timer_current_line = -1 self.lines_words = {} self.unique_words_curr_line = {} @@ -79,11 +86,8 @@ function buffer.close(self) end function buffer.stop_indexing_timer(self) - if self.timer and not self.timer:is_closing() then - self.timer:stop() - self.timer:close() - end - self.timer = nil + self.timer:stop() + self.timer_current_line = -1 end function buffer.mark_all_lines_dirty(self) @@ -91,23 +95,7 @@ function buffer.mark_all_lines_dirty(self) self.unique_words_other_lines_dirty = true self.last_edit_first_line = 0 self.last_edit_last_line = 0 -end - ----Indexing buffer -function buffer.index(self) - self.lines_count = vim.api.nvim_buf_line_count(self.bufnr) - -- NOTE: Pre-allocating self.lines_words here somehow wastes more memory, and - -- not doing that doesn't have a visible effect on performance. Win-win. - -- for i = 1, self.lines_count do - -- self.lines_words[i] = {} - -- end - - if self.opts.indexing_interval < 1 then - self:index_range(0, self.lines_count) - self:mark_all_lines_dirty() - else - self:index_buffer_async() - end + self.words_distances_dirty = true end --- Workaround for https://github.com/neovim/neovim/issues/16729 @@ -119,7 +107,6 @@ function buffer.safe_buf_call(self, callback) end end ---- sync algorithm function buffer.index_range(self, range_start, range_end, skip_already_indexed) self:safe_buf_call(function() local chunk_size = self.GET_LINES_CHUNK_SIZE @@ -137,55 +124,50 @@ function buffer.index_range(self, range_start, range_end, skip_already_indexed) end) end ---- async algorithm -function buffer.index_buffer_async(self) - local chunk_start = 0 +function buffer.start_indexing_timer(self) + self.lines_count = vim.api.nvim_buf_line_count(self.bufnr) + self.timer_current_line = 0 - -- This flag prevents vim.schedule() callbacks from piling up in the queue - -- when the indexing interval is very short. - local scheduled = false - self.timer = vim.loop.new_timer() -- Negative values result in an integer overflow in luv (vim.loop), and zero -- disables timer repeat, so only intervals larger than 1 are valid. local interval = math.max(1, self.opts.indexing_interval) self.timer:start(0, interval, function() - if scheduled then + if self.closed then + self:stop_indexing_timer() return end - scheduled = true - vim.schedule(function() - scheduled = false - if self.closed then - return - end - -- Note that the async indexer is designed to not break even if the user - -- is editing the file while it is in the process of being indexed. - -- Because the indexing in watcher must use the synchronous algorithm, we - -- assume that the data already present in self.lines_words to be correct - -- and doesn't need refreshing here because even if we do receive text - -- from nvim_buf_get_lines different from what the watcher has seen, it - -- will catch up on the next on_lines event. + -- Note that the async indexer is designed to not break even if the user is + -- editing the file while it is in the process of being indexed. Because + -- the indexing in watcher must use the synchronous algorithm, we assume + -- that the data already present in self.lines_words to be correct and + -- doesn't need refreshing here because even if we do receive text from + -- nvim_buf_get_lines different from what the watcher has seen so far, it + -- (the watcher) will catch up on the next on_lines event. 
- local line_count = vim.api.nvim_buf_line_count(self.bufnr) - -- Skip over the already indexed lines - while chunk_start < line_count and self.lines_words[chunk_start + 1] do - chunk_start = chunk_start + 1 - end - local chunk_end = math.min(chunk_start + self.opts.indexing_chunk_size, line_count) - if chunk_end >= line_count then - self:stop_indexing_timer() - end - self:index_range(chunk_start, chunk_end, true) - chunk_start = chunk_end - self:mark_all_lines_dirty() - self.words_distances_dirty = true - end) + -- Skip over the already indexed lines + while self.lines_words[self.timer_current_line + 1] do + self.timer_current_line = self.timer_current_line + 1 + end + + local chunk_start = self.timer_current_line + local chunk_size = self.opts.indexing_chunk_size + -- NOTE: self.lines_count may be modified by the indexer. + local chunk_end = chunk_size >= 1 and math.min(chunk_start + chunk_size, self.lines_count) or self.lines_count + if chunk_end >= self.lines_count then + self:stop_indexing_timer() + end + + self:index_range(chunk_start, chunk_end, true) + self.timer_current_line = chunk_end + self:mark_all_lines_dirty() end) end --- watch function buffer.watch(self) + self.lines_count = vim.api.nvim_buf_line_count(self.bufnr) + -- NOTE: As far as I know, indexing in watching can't be done asynchronously -- because even built-in commands generate multiple consequent `on_lines` -- events, and I'm not even mentioning plugins here. To get accurate results @@ -250,6 +232,25 @@ function buffer.watch(self) end self.lines_count = new_lines_count + -- This branch is support code for handling cases when the user is + -- editing the buffer while the async indexer is running. It solves the + -- problem that if new lines are inserted or old lines are deleted, the + -- indexes of each subsequent line will change, and so the indexer + -- current position must be adjusted to not accidentally skip any lines. + if self.timer:is_active() then + if first_line <= self.timer_current_line and self.timer_current_line < old_last_line then + -- The indexer was in the area of the current text edit. We will + -- synchronously index this area it in a moment, so the indexer + -- should resume from right after the edit range. + self.timer_current_line = new_last_line + elseif self.timer_current_line >= old_last_line then + -- The indexer was somewhere past the current text edit. This means + -- that the line numbers could have changed, and the indexing + -- position must be adjusted accordingly. + self.timer_current_line = self.timer_current_line + delta + end + end + -- replace lines self:index_range(first_line, new_last_line) @@ -270,25 +271,13 @@ function buffer.watch(self) return true end - -- The logic for adjusting lines list on buffer reloads is much simpler - -- because tables of all lines can be assumed to be fresh. - local new_lines_count = vim.api.nvim_buf_line_count(self.bufnr) - if new_lines_count > self.lines_count then -- append - -- Again, no need to pre-allocate, index_line will append new lines - -- itself. 
- -- for i = self.lines_count + 1, new_lines_count do - -- self.lines_words[i] = {} - -- end - elseif new_lines_count < self.lines_count then -- remove - for i = self.lines_count, new_lines_count + 1, -1 do - self.lines_words[i] = nil - end + -- clear all lines + for i = self.lines_count, 1, -1 do + self.lines_words[i] = nil end - self.lines_count = new_lines_count - self:index_range(0, self.lines_count) - self:mark_all_lines_dirty() - self.words_distances_dirty = true + self:stop_indexing_timer() + self:start_indexing_timer() end, on_detach = function(_, _) diff --git a/lua/cmp_buffer/source.lua b/lua/cmp_buffer/source.lua index 46c0593..ed1447d 100644 --- a/lua/cmp_buffer/source.lua +++ b/lua/cmp_buffer/source.lua @@ -50,7 +50,7 @@ source.complete = function(self, params, callback) local processing = false local bufs = self:_get_buffers(opts) for _, buf in ipairs(bufs) do - if buf.timer then + if buf.timer:is_active() then processing = true break end @@ -90,7 +90,7 @@ source._get_buffers = function(self, opts) new_buf.on_close_cb = function() self.buffers[bufnr] = nil end - new_buf:index() + new_buf:start_indexing_timer() new_buf:watch() self.buffers[bufnr] = new_buf end diff --git a/lua/cmp_buffer/timer.lua b/lua/cmp_buffer/timer.lua new file mode 100644 index 0000000..2d0b708 --- /dev/null +++ b/lua/cmp_buffer/timer.lua @@ -0,0 +1,48 @@ +---@class cmp_buffer.Timer +---@field public handle any +---@field private callback_wrapper_instance fun()|nil +local timer = {} + +function timer.new() + local self = setmetatable({}, { __index = timer }) + self.handle = vim.loop.new_timer() + self.callback_wrapper_instance = nil + return self +end + +---@param timeout_ms number +---@param repeat_ms number +---@param callback fun() +function timer:start(timeout_ms, repeat_ms, callback) + local scheduled = false + local function callback_wrapper() + if scheduled then + return + end + scheduled = true + vim.schedule(function() + scheduled = false + if self.callback_wrapper_instance ~= callback_wrapper then + return + end + callback() + end) + end + self.handle:start(timeout_ms, repeat_ms, callback_wrapper) + self.callback_wrapper_instance = callback_wrapper +end + +function timer:stop() + self.handle:stop() + self.callback_wrapper_instance = nil +end + +function timer:is_active() + return self.handle:is_active() +end + +function timer:close() + self.handle:close() +end + +return timer From c8daddb987ba2e295be3eae48c91aeaba6a92170 Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Mon, 20 Dec 2021 12:37:28 +0200 Subject: [PATCH 4/8] rename indexing_chunk_size to indexing_batch_size --- lua/cmp_buffer/buffer.lua | 12 ++++++------ lua/cmp_buffer/source.lua | 7 ++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 8aacd61..26444a7 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -150,16 +150,16 @@ function buffer.start_indexing_timer(self) self.timer_current_line = self.timer_current_line + 1 end - local chunk_start = self.timer_current_line - local chunk_size = self.opts.indexing_chunk_size + local batch_start = self.timer_current_line + local batch_size = self.opts.indexing_batch_size -- NOTE: self.lines_count may be modified by the indexer. 
- local chunk_end = chunk_size >= 1 and math.min(chunk_start + chunk_size, self.lines_count) or self.lines_count - if chunk_end >= self.lines_count then + local batch_end = batch_size >= 1 and math.min(batch_start + batch_size, self.lines_count) or self.lines_count + if batch_end >= self.lines_count then self:stop_indexing_timer() end - self:index_range(chunk_start, chunk_end, true) - self.timer_current_line = chunk_end + self:index_range(batch_start, batch_end, true) + self.timer_current_line = batch_end self:mark_all_lines_dirty() end) end diff --git a/lua/cmp_buffer/source.lua b/lua/cmp_buffer/source.lua index ed1447d..df0ddbe 100644 --- a/lua/cmp_buffer/source.lua +++ b/lua/cmp_buffer/source.lua @@ -4,7 +4,7 @@ local buffer = require('cmp_buffer.buffer') ---@field public keyword_length number ---@field public keyword_pattern string ---@field public get_bufnrs fun(): number[] ----@field public indexing_chunk_size number +---@field public indexing_batch_size number ---@field public indexing_interval number ---@type cmp_buffer.Options @@ -14,7 +14,7 @@ local defaults = { get_bufnrs = function() return { vim.api.nvim_get_current_buf() } end, - indexing_chunk_size = 1000, + indexing_batch_size = 1000, indexing_interval = 100, } @@ -33,7 +33,7 @@ source._validate_options = function(_, params) keyword_length = { opts.keyword_length, 'number' }, keyword_pattern = { opts.keyword_pattern, 'string' }, get_bufnrs = { opts.get_bufnrs, 'function' }, - indexing_chunk_size = { opts.indexing_chunk_size, 'number' }, + indexing_batch_size = { opts.indexing_batch_size, 'number' }, indexing_interval = { opts.indexing_interval, 'number' }, }) return opts @@ -82,6 +82,7 @@ source.complete = function(self, params, callback) end ---@param opts cmp_buffer.Options +---@return cmp_buffer.Buffer[] source._get_buffers = function(self, opts) local buffers = {} for _, bufnr in ipairs(opts.get_bufnrs()) do From 0226d443a5332b1de12b2802883115b9c7f6b491 Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Mon, 20 Dec 2021 12:37:41 +0200 Subject: [PATCH 5/8] completely rewrite the documentation for the new options --- README.md | 97 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 54 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 26025fc..a08c1c9 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ The below source configuration are available. To set any of these options, do: ```lua cmp.setup({ sources = { - { + { name = 'buffer', option = { -- Options go into this table @@ -111,54 +111,16 @@ end ### indexing_interval (type: number) -_Default:_ `200` +_Default:_ `100` -The rate (in milliseconds) at which buffers are scanned for words when they are first opened. -Setting this interval to lower values will increase the speed of indexing, but at the expense of -higher CPU usage. By default indexing happens asynchronously, but setting this option to zero or -a negative value will switch indexing to a synchronous algorithm, which uses significantly less -RAM on big files and takes less time in total (to index the entire file), with the obvious -downside of blocking the user interface for a second or two. On small files (up to tens of -thousands of lines, probably) the difference will be unnoticeable, though. +Advanced option. See the section [Indexing](#indexing). -### indexing_chunk_size (type: number) +### indexing_batch_size (type: number) _Default:_ `1000` -The number of lines processed in batch every `indexing_interval` milliseconds. 
Setting it to -higher values will make indexing faster, but at the cost of responsiveness of the UI. When using -the synchronous mode, changing this option may improve memory usage, though the default value has -been tested to be pretty good in this regard. - -Please note that the `indexing_interval` and `indexing_chunk_size` are advanced options, change -them only if you experience performance or RAM usage problems (or need to work on particularly -large files) and be sure to measure the results! - - -## Performance on large text files - -This source has been tested on code files of a few megabytes in size (5-10) and it has been -optimized for them, however, the indexed words can still take up tens of megabytes of RAM if the -file is big (on small files it _will not be more_ than a couple of megabytes, typically much -less). So if you wish to avoid accidentally wasting lots of RAM when editing big files, you can -tweak `get_bufnrs`, for example like this: - -```lua -get_bufnrs = function() - local buf = vim.api.nvim_get_current_buf() - local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf)) - if byte_size > 1024 * 1024 then -- 1 Megabyte max - return {} - end - return { buf } -end -``` - -Of course, this snippet can be combined with any other recipes for `get_bufnrs`. - -As another tip, turning on the synchronous indexing mode is very likely to help with reducing -memory usage, see the `indexing_interval` option. +Advanced option. See the section [Indexing](#indexing). ## Locality bonus comparator (distance-based sorting) @@ -185,3 +147,52 @@ cmp.setup({ } }) ``` + + +## Indexing + +When a buffer is opened, this source first has to scan all lines in the buffer, match all words +and store all of their occurrences. This process is called _indexing_. When actually editing the +text in the buffer, the index of words is kept up-to-date with changes to the buffer's contents, +this is called _watching_. It is done by re-running the indexer on just the changed lines. +Indexing happens completely asynchronously in background, unlike watching, which must be performed +synchronously to ensure that the index of words is kept perfectly in-sync with the lines in the +buffer. However, most of the time this will not be a problem since many typical text edit +operations affect only one or two lines, unless you are pasting a 1000-line snippet. + +_Note that you can freely edit the buffer while it is being indexed_, the underlying algorithm is +written in such a way that your changes will not break the index or cause errors. If a crash does +happen - it is a bug, so please report it. + +The speed of indexing is configurable with two options: `indexing_interval` and +`indexing_batch_size`. Essentially, when indexing, a timer is started, which pulls a batch of +`indexing_batch_size` lines from the buffer, scans them for words, and repeats after +`indexing_interval` milliseconds. Decreasing interval and/or increasing the batch size will make +the indexer faster, but at the expense of higher CPU usage and more lag when editing the file +while indexing is still in progress. Setting `indexing_batch_size` to a negative value will switch +the indexer to the "synchronous" mode: this will process all lines in one go, take less time in +total (since no other code will be running on the Lua thread), but with the obvious downside that +the editor UI will be blocked. 
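+
+For illustration, the two knobs could be set like this (the numbers below are examples only,
+not recommendations; `indexing_batch_size = -1` shows the synchronous mode described above):
+
+```lua
+cmp.setup({
+  sources = {
+    {
+      name = 'buffer',
+      option = {
+        -- Index more lines per tick, more often (faster, but heavier on the CPU):
+        indexing_interval = 50,
+        indexing_batch_size = 2000,
+        -- Or uncomment this to index the whole buffer in one go, blocking the UI:
+        -- indexing_batch_size = -1,
+      },
+    },
+  },
+})
+```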
+ +### Performance on large text files + +This source has been tested on code files of a few megabytes in size (5-10) and contains +optimizations for them, however, the indexed words can still take up tens of megabytes of RAM if +the file is large. It also currently has troubles on files with very long lines, see issue +[#13](https://github.com/hrsh7th/cmp-buffer/issues/13). + +So, if you wish to avoid accidentally running this source on big files, you can tweak +`get_bufnrs`, for example like this: + +```lua +get_bufnrs = function() + local buf = vim.api.nvim_get_current_buf() + local byte_size = vim.api.nvim_buf_get_offset(buf, vim.api.nvim_buf_line_count(buf)) + if byte_size > 1024 * 1024 then -- 1 Megabyte max + return {} + end + return { buf } +end +``` + +Of course, this snippet can be combined with any other recipes for `get_bufnrs`. From 70713e1ca477cabbb1d974929add56f3e0556eff Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Mon, 20 Dec 2021 13:52:53 +0200 Subject: [PATCH 6/8] add comments to the new Timer class --- lua/cmp_buffer/timer.lua | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lua/cmp_buffer/timer.lua b/lua/cmp_buffer/timer.lua index 2d0b708..3ed83c7 100644 --- a/lua/cmp_buffer/timer.lua +++ b/lua/cmp_buffer/timer.lua @@ -1,3 +1,18 @@ +---This timer matches the semantics of setInterval and clearInterval of +---Javascript. It provides a more reliable alternative to vim.loop.timer_start +---with a callback wrapped into a vim.schedule call by addressing two problems: +---1. Scheduled callbacks are invoked less frequently than a libuv timer with a +--- small interval (1-5ms). This causes those callbacks to fill up the queue +--- in the event loop, and so the callback function may get invoked multiple +--- times on one event loop tick. In contrast, Javascript's setInterval +--- guarantees that the callback is not invoked more frequently than the +--- interval. +---2. When a libuv timer is stopped with vim.loop.timer_stop, it doesn't affect +--- the callbacks that have already been scheduled. So timer_stop will not +--- immediately stop the timer, the actual callback function will run one +--- more time until it is finally stopped. This implementation ensures that +--- timer_stop prevents any subsequent invocations of the callback. +--- ---@class cmp_buffer.Timer ---@field public handle any ---@field private callback_wrapper_instance fun()|nil @@ -14,7 +29,11 @@ end ---@param repeat_ms number ---@param callback fun() function timer:start(timeout_ms, repeat_ms, callback) + -- This is the flag that fixes problem 1. local scheduled = false + -- Creating a function on every call to timer_start ensures that we can always + -- detect when a different callback is set by calling timer_start and prevent + -- the old one from being invoked. local function callback_wrapper() if scheduled then return @@ -22,6 +41,7 @@ function timer:start(timeout_ms, repeat_ms, callback) scheduled = true vim.schedule(function() scheduled = false + -- Either a different callback was set, or the timer has been stopped. 
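+      -- (timer:stop() clears callback_wrapper_instance, so a callback that was
+      -- scheduled before the stop call bails out here; this is what addresses
+      -- problem 2 described at the top of this file.)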
if self.callback_wrapper_instance ~= callback_wrapper then return end From 077d7de49a9d6244d67bd6a1e08206cda9f7aa0c Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Fri, 24 Dec 2021 14:49:15 +0200 Subject: [PATCH 7/8] use clear_table for clearing lines list in on_reload --- lua/cmp_buffer/buffer.lua | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index 26444a7..fd94436 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -1,5 +1,11 @@ local timer = require('cmp_buffer.timer') +local function clear_table(tbl) + for k in pairs(tbl) do + tbl[k] = nil + end +end + ---@class cmp_buffer.Buffer ---@field public bufnr number ---@field public opts cmp_buffer.Options @@ -271,10 +277,7 @@ function buffer.watch(self) return true end - -- clear all lines - for i = self.lines_count, 1, -1 do - self.lines_words[i] = nil - end + clear_table(self.lines_words) self:stop_indexing_timer() self:start_indexing_timer() @@ -289,12 +292,6 @@ function buffer.watch(self) }) end -local function clear_table(tbl) - for k in pairs(tbl) do - tbl[k] = nil - end -end - ---@param linenr number ---@param line string function buffer.index_line(self, linenr, line) From eba65f6fdabf294fd598a8ff688d342ff330d5ea Mon Sep 17 00:00:00 2001 From: Dmytro Meleshko Date: Fri, 24 Dec 2021 14:53:02 +0200 Subject: [PATCH 8/8] place index_range calls as the last statement where it is called --- lua/cmp_buffer/buffer.lua | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lua/cmp_buffer/buffer.lua b/lua/cmp_buffer/buffer.lua index fd94436..b530e64 100644 --- a/lua/cmp_buffer/buffer.lua +++ b/lua/cmp_buffer/buffer.lua @@ -163,10 +163,10 @@ function buffer.start_indexing_timer(self) if batch_end >= self.lines_count then self:stop_indexing_timer() end - - self:index_range(batch_start, batch_end, true) self.timer_current_line = batch_end self:mark_all_lines_dirty() + + self:index_range(batch_start, batch_end, true) end) end @@ -257,9 +257,6 @@ function buffer.watch(self) end end - -- replace lines - self:index_range(first_line, new_last_line) - if first_line == self.last_edit_first_line and old_last_line == self.last_edit_last_line and new_last_line == self.last_edit_last_line then self.unique_words_curr_line_dirty = true else @@ -270,6 +267,9 @@ function buffer.watch(self) self.last_edit_last_line = new_last_line self.words_distances_dirty = true + + -- replace lines + self:index_range(first_line, new_last_line) end, on_reload = function(_, _)