author    Jefferson González <jgmdev@gmail.com>  2022-05-31 11:04:48 -0400
committer GitHub <noreply@github.com>  2022-05-31 11:04:48 -0400
commit    bd742d5b3d83ebfef80efc7b8e1c63eaa1721814 (patch)
tree      d384d992209751812d5453473ee14451eee5dd2d
parent    4f0d45d6ab10cf86ff711fb2e2a66145e9021055 (diff)
parent    d8efb1ab53c7e6414d78230219f5ae6655b8b9b0 (diff)
Merge pull request #999 from Guldoman/tokenizer_regex_groups
Allow regexes in `tokenizer` to split tokens with groups
-rw-r--r--  data/core/regex.lua      5
-rw-r--r--  data/core/tokenizer.lua  26
-rw-r--r--  src/api/regex.c          2
3 files changed, 30 insertions(+), 3 deletions(-)
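
The merged feature lets a syntax definition split a single regex match into
several differently-typed tokens at group boundaries, as the tokenizer could
already do for Lua patterns. A minimal sketch, assuming lite-xl's
`core.syntax` plugin format (the "Example" language below is hypothetical,
not part of this commit):

local syntax = require "core.syntax"

syntax.add {
  name = "Example",
  files = { "%.example$" },
  patterns = {
    -- The empty group () marks a split point: the regex matches "key="
    -- as one unit, but the text before the group becomes a "keyword"
    -- token and the "=" an "operator" token.
    { regex = [[\w+()=]], type = { "keyword", "operator" } },
  },
  symbols = {},
}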
diff --git a/data/core/regex.lua b/data/core/regex.lua
index 637d23fd..fa85d56c 100644
--- a/data/core/regex.lua
+++ b/data/core/regex.lua
@@ -5,8 +5,9 @@ regex.__index = function(table, key) return regex[key]; end
 regex.match = function(pattern_string, string, offset, options)
   local pattern = type(pattern_string) == "table" and
     pattern_string or regex.compile(pattern_string)
-  local s, e = regex.cmatch(pattern, string, offset or 1, options or 0)
-  return s, e and e - 1
+  local res = { regex.cmatch(pattern, string, offset or 1, options or 0) }
+  res[2] = res[2] and res[2] - 1
+  return table.unpack(res)
 end
 
 -- Will iterate back through any UTF-8 bytes so that we don't replace bits
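
With this change `regex.match` forwards every offset that `regex.cmatch`
produces instead of keeping only the first pair, so capture-group positions
reach the caller; only the whole-match end is adjusted to be inclusive. A
rough sketch of the new return shape (`regex` is lite-xl's built-in PCRE2
binding; the exact values assume the 1-based offsets used above):

-- two capture groups matched against "foo=bar":
local ms, me, g1s, g1e, g2s, g2e = regex.match([[(\w+)=(\w+)]], "foo=bar")
-- ms, me   --> 1, 7  whole match; `me` made inclusive by the `- 1`
-- g1s, g1e --> 1, 4  group "foo"; group ends stay one-past-the-end
-- g2s, g2e --> 5, 8  group "bar"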
diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index ebe550ff..555d60b5 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -1,9 +1,12 @@
+local core = require "core"
 local syntax = require "core.syntax"
 local common = require "core.common"
 
 local tokenizer = {}
+local bad_patterns = {}
 
 local function push_token(t, type, text)
+  type = type or "normal"
   local prev_type = t[#t-1]
   local prev_text = t[#t]
   if prev_type and (prev_type == type or prev_text:ufind("^%s*$")) then
@@ -173,6 +176,20 @@ function tokenizer.tokenize(incoming_syntax, text, state)
         or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
       if p.regex and #res > 0 then -- set correct utf8 len for regex result
         res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
+        -- `regex.match` returns group results as a series of `begin, end`
+        -- we only want `begin`s
+        if #res >= 3 then
+          res[3] = res[1] + string.ulen(text:sub(res[1], res[3])) - 1
+        end
+        for i=1,(#res-3) do
+          local curr = i + 3
+          local from = i * 2 + 3
+          if from < #res then
+            res[curr] = res[1] + string.ulen(text:sub(res[1], res[from])) - 1
+          else
+            res[curr] = nil
+          end
+        end
         res[1] = next
       end
       if res[1] and close and target[3] then
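
The added loop rewrites the flat `begin, end` pairs in place so that only
the match bounds plus one `begin` per group remain (the `string.ulen` calls
additionally convert byte offsets to UTF-8 character offsets). A simplified
trace of the index arithmetic, conversion omitted and offsets taken from the
"foo=bar" example above:

local res = { 1, 7, 1, 4, 5, 8 }  -- match 1..7 plus two group pairs
local n = #res                    -- fixed here for clarity; the code above
                                  -- re-reads #res as slots are cleared,
                                  -- which yields the same result
for i = 1, n - 3 do
  local curr = i + 3              -- slot that receives group (i+1)'s begin
  local from = i * 2 + 3          -- where group (i+1)'s begin currently sits
  if from < n then
    res[curr] = res[from]
  else
    res[curr] = nil               -- drop the now-unused end offsets
  end
end
-- res is now { 1, 7, 1, 5 }: match begin/end plus one begin per group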
@@ -242,6 +259,15 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     local matched = false
     for n, p in ipairs(current_syntax.patterns) do
       local find_results = { find_text(text, p, i, true, false) }
+      if #find_results - 1 > #p.type then
+        if not bad_patterns[current_syntax] then
+          bad_patterns[current_syntax] = { }
+        end
+        if not bad_patterns[current_syntax][n] then
+          bad_patterns[current_syntax][n] = true
+          core.error("Malformed pattern #%d in %s language plugin", n, current_syntax.name or "unnamed")
+        end
+      end
       if find_results[1] then
         -- matched pattern; make and add tokens
         push_tokens(res, current_syntax, p, text, find_results)
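
The new check compares the number of pieces a match splits into
(`#find_results - 1`) against the number of declared types, and reports each
offending pattern only once per syntax via the `bad_patterns` cache. A
hypothetical entry that would trip it:

-- Two split groups yield three pieces ("key", "=", "value"), but only two
-- types are declared, so tokenizing logs once:
--   Malformed pattern #1 in Example language plugin
{ regex = [[\w+()=()\w+]], type = { "keyword", "operator" } },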
diff --git a/src/api/regex.c b/src/api/regex.c
index 6a0aac7a..d23eaf71 100644
--- a/src/api/regex.c
+++ b/src/api/regex.c
@@ -88,7 +88,7 @@ static int f_pcre_match(lua_State *L) {
     return 0;
   }
   for (int i = 0; i < rc*2; i++)
-    lua_pushnumber(L, ovector[i]+offset+1);
+    lua_pushinteger(L, ovector[i]+offset+1);
   pcre2_match_data_free(md);
   return rc*2;
 }
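
The one-line C change swaps `lua_pushnumber` for `lua_pushinteger`, so the
offsets arrive in Lua as integers rather than floats. On Lua 5.3+, which
distinguishes the two subtypes, the difference is visible from the Lua side
(a quick sketch, using lite-xl's global `regex` binding):

local s, e = regex.match("b", "abc")
print(math.type(s), tostring(s))  --> integer   2   (previously: float  2.0)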