QScintilla中的Pygments

Question

Consider this mcve: 考虑一下这个mcve：

import math
import sys
import textwrap
import time
from pathlib import Path
from collections import defaultdict

from PyQt5.Qsci import QsciLexerCustom, QsciScintilla
from PyQt5.Qt import *

from pygments import lexers, styles, highlight, formatters
from pygments.lexer import Error, RegexLexer, Text, _TokenType
from pygments.style import Style


# Editor colours and options that accompany a Pygments style, keyed by the
# Pygments style name. ViewLexer and View look entries up by the style name
# they are constructed with.
# NOTE(review): the 8-digit values (e.g. "#F8F8F259") look like Sublime Text's
# #RRGGBBAA notation, whereas QColor reads 8-digit hex as #AARRGGBB -- confirm
# the channel order before handing such a value to QColor unchanged.
EXTRA_STYLES = {
    "monokai": {
        "background": "#272822",
        "caret": "#F8F8F0",
        "foreground": "#F8F8F2",
        "invisibles": "#F8F8F259",
        "lineHighlight": "#3E3D32",
        "selection": "#49483E",
        "findHighlight": "#FFE792",
        "findHighlightForeground": "#000000",
        "selectionBorder": "#222218",
        "activeGuide": "#9D550FB0",
        "misspelling": "#F92672",
        "bracketsForeground": "#F8F8F2A5",
        "bracketsOptions": "underline",
        "bracketContentsForeground": "#F8F8F2A5",
        "bracketContentsOptions": "underline",
        "tagsOptions": "stippled_underline",
    }
}


def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes (int or float).

    Returns:
        ``"0B"`` for zero, otherwise ``"<value> <unit>"`` where ``value`` is
        rounded to two decimals, e.g. ``"1.5 KB"``. Values beyond the largest
        known unit are expressed in that unit (``"1024.0 YB"``) and values
        below one byte are expressed in bytes.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative")

    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    last = len(size_name) - 1

    # Step through the units instead of using a floating-point logarithm:
    # this is exact at unit boundaries and keeps the index inside the table
    # for both very large and fractional inputs.
    i = 0
    remaining = size_bytes
    while remaining >= 1024 and i < last:
        remaining /= 1024
        i += 1

    s = round(size_bytes / 1024 ** i, 2)
    return f"{s} {size_name[i]}"


class ViewLexer(QsciLexerCustom):
    """QScintilla custom lexer that styles text with a Pygments lexer and style.

    Every token type listed by the Pygments style is given its own QScintilla
    style number; ``styleText`` then tokenizes the buffer with the Pygments
    lexer and applies those style numbers.
    """

    def __init__(self, lexer_name, style_name):
        """Create the lexer.

        Args:
            lexer_name: Pygments lexer alias, passed to ``get_lexer_by_name``.
            style_name: Pygments style name, passed to ``get_style_by_name``.
                It is also used to look up optional extra colours in
                ``EXTRA_STYLES``; a style without an entry there is accepted.
        """
        super().__init__()

        # Lexer + Style
        self.pyg_style = styles.get_style_by_name(style_name)
        self.pyg_lexer = lexers.get_lexer_by_name(lexer_name, stripnl=False)
        # Not read by this class; it only holds the initial lexer state stack
        # for line 0.
        self.cache = {
            0: ('root',)
        }
        # Styles without an EXTRA_STYLES entry fall back to an empty mapping
        # instead of raising KeyError.
        self.extra_style = EXTRA_STYLES.get(style_name, {})

        # Generate QScintilla styles
        # NOTE(review): this attribute has the same name as the inherited
        # font() method and hides it on the Python side -- confirm nothing
        # needs to call lexer.font(style) from Python.
        self.font = QFont("Consolas", 8, weight=QFont.Bold)
        self.token_styles = {}
        for index, (ttype, style_def) in enumerate(self.pyg_style):
            self.token_styles[ttype] = index
            if style_def.get("color", None):
                self.setColor(QColor(f"#{style_def['color']}"), index)
            if style_def.get("bgcolor", None):
                self.setPaper(QColor(f"#{style_def['bgcolor']}"), index)

            self.setFont(self.font, index)

    def defaultPaper(self, style):
        """Return the background colour used for every style number.

        The ``"background"`` entry of the extra style wins; without one the
        Pygments style's own ``background_color`` is used.
        """
        background = self.extra_style.get(
            "background", self.pyg_style.background_color
        )
        return QColor(background)

    def language(self):
        """Return the name of the underlying Pygments lexer."""
        return self.pyg_lexer.name

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (index, tokentype, value) triples.

        ``stack`` is the initial state stack (default: ``('root',)``); it is
        copied, so the caller's sequence is never modified.

        The loop walks the compiled token definitions of the Pygments lexer
        (``lexer._tokens``): at each position the rules of the current state
        are tried in order, and the first match is emitted and applied.
        """
        lexer = self.pyg_lexer
        pos = 0
        tokendefs = lexer._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while True:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            # Callback actions produce their own triples.
                            yield from action(lexer, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop
                            del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                if pos >= len(text):
                    # End of input.
                    break
                if text[pos] == '\n':
                    # at EOL, reset state to "root"
                    statestack = ['root']
                    statetokens = tokendefs['root']
                    yield pos, Text, '\n'
                    pos += 1
                    continue
                yield pos, Error, text[pos]
                pos += 1

    def _style_for(self, ttype):
        """Return the style number for ``ttype``.

        Token types the style does not list resolve to their nearest listed
        ancestor (style 0 if there is none); the result is remembered so the
        walk happens once per token type.
        """
        token_styles = self.token_styles
        current = ttype
        while current is not None and current not in token_styles:
            current = current.parent
        index = token_styles[current] if current is not None else 0
        token_styles[ttype] = index
        return index

    def highlight_slow(self, start, end):
        """Tokenize and style everything from ``start`` to the end of the buffer.

        ``end`` is accepted but not used: styling always continues to the end
        of the text, so the cost grows with the amount of text after
        ``start``.
        """
        view = self.editor()
        text = view.text()
        if view.isUtf8():
            # In UTF-8 mode styling positions and lengths count bytes, not
            # characters, so slice and measure the encoded form.
            code = text.encode("utf-8")[start:].decode("utf-8", "replace")

            def width(value):
                return len(value.encode("utf-8"))
        else:
            code = text[start:]
            width = len

        self.startStyling(start)
        for _, ttype, value in self.get_tokens_unprocessed(code):
            self.setStyling(width(value), self._style_for(ttype))

    def styleText(self, start, end):
        """Restyle the text and report the elapsed time in the window title."""
        view = self.editor()
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals.
        t_start = time.perf_counter()
        self.highlight_slow(start, end)
        t_elapsed = time.perf_counter() - t_start
        len_text = len(view.text())
        text_size = convert_size(len_text)
        view.setWindowTitle(f"Text size: {len_text} - {text_size} Elapsed: {t_elapsed}s")

    def description(self, style_nr):
        """Return a description for ``style_nr``: the number as a string."""
        return str(style_nr)


class View(QsciScintilla):
    """Demo editor widget: a QsciScintilla styled by a ViewLexer.

    Ctrl+1 halves and Ctrl+2 doubles ``text_size``; each change refills the
    editor with that many characters taken from this script's own source.
    """

    def __init__(self, lexer_name, style_name):
        """Create the editor.

        Args:
            lexer_name: Pygments lexer alias handed to ``ViewLexer``.
            style_name: Pygments style name handed to ``ViewLexer``; also used
                to look up optional extra colours in ``EXTRA_STYLES``.
        """
        super().__init__()

        # -------- Lexer --------
        self.setEolMode(QsciScintilla.EolUnix)
        # NOTE(review): this attribute has the same name as the inherited
        # lexer() method and hides it on the Python side -- confirm nothing
        # needs to call view.lexer() from Python.
        self.lexer = ViewLexer(lexer_name, style_name)
        self.setLexer(self.lexer)

        # -------- Shortcuts --------
        self.text_size = 1
        self.s1 = QShortcut("ctrl+1", self, self.reduce_text_size)
        self.s2 = QShortcut("ctrl+2", self, self.increase_text_size)

        # -------- Multiselection --------
        self.SendScintilla(self.SCI_SETMULTIPLESELECTION, True)
        self.SendScintilla(self.SCI_SETMULTIPASTE, 1)
        self.SendScintilla(self.SCI_SETADDITIONALSELECTIONTYPING, True)

        # -------- Extra settings --------
        # A style without an EXTRA_STYLES entry simply gets no extra colours.
        self.set_extra_settings(EXTRA_STYLES.get(style_name, {}))

    def get_line_separator(self):
        """Return the line terminator for the current EOL mode ('' if unknown)."""
        separators = {
            QsciScintilla.EolWindows: '\r\n',
            QsciScintilla.EolUnix: '\n',
            QsciScintilla.EolMac: '\r',
        }
        return separators.get(self.eolMode(), '')

    @staticmethod
    def _first_present(dct, *keys):
        """Return the value of the first of ``keys`` found in ``dct``, else None."""
        for key in keys:
            if key in dct:
                return dct[key]
        return None

    @staticmethod
    def _to_qcolor(value):
        """Build a QColor from ``#RRGGBB`` or ``#RRGGBBAA`` text.

        QColor reads 8-digit hex as ``#AARRGGBB``, so a trailing alpha pair
        is moved to the front before the string is handed over.
        """
        if isinstance(value, str) and len(value) == 9 and value.startswith("#"):
            value = f"#{value[7:9]}{value[1:7]}"
        return QColor(value)

    def set_extra_settings(self, dct):
        """Apply editor colours from a settings mapping.

        Recognised keys (all optional): ``caret``, ``lineHighlight``,
        ``bracketsBackground``, ``bracketsForeground``, ``selection`` and
        ``background``. The snake_case spellings ``line_highlight``,
        ``brackets_background`` and ``brackets_foreground`` are accepted as
        well. Eight-digit colour values are read as ``#RRGGBBAA``.
        """
        self.setIndentationGuidesBackgroundColor(QColor(0, 0, 255, 0))
        self.setIndentationGuidesForegroundColor(QColor(0, 255, 0, 0))

        caret = self._first_present(dct, "caret")
        if caret is not None:
            self.setCaretForegroundColor(self._to_qcolor(caret))

        line_highlight = self._first_present(
            dct, "lineHighlight", "line_highlight"
        )
        if line_highlight is not None:
            # The colour only shows once the caret line itself is enabled.
            self.setCaretLineVisible(True)
            self.setCaretLineBackgroundColor(self._to_qcolor(line_highlight))

        brace_bg = self._first_present(
            dct, "bracketsBackground", "brackets_background"
        )
        if brace_bg is not None:
            self.setMatchedBraceBackgroundColor(self._to_qcolor(brace_bg))

        brace_fg = self._first_present(
            dct, "bracketsForeground", "brackets_foreground"
        )
        if brace_fg is not None:
            self.setMatchedBraceForegroundColor(self._to_qcolor(brace_fg))

        selection = self._first_present(dct, "selection")
        if selection is not None:
            self.setSelectionBackgroundColor(self._to_qcolor(selection))

        background = self._first_present(dct, "background")
        if background is not None:
            c = self._to_qcolor(background)
            self.resetFoldMarginColors()
            self.setFoldMarginColors(c, c)

    def increase_text_size(self):
        """Double the target text size and regenerate the editor content."""
        self.text_size *= 2
        self.gen_text()

    def reduce_text_size(self):
        """Halve the target text size (never below 1) and regenerate the content."""
        if self.text_size <= 1:
            return
        self.text_size //= 2
        self.gen_text()

    def gen_text(self):
        """Fill the editor with ``text_size`` characters of this script's source.

        The source is repeated as often as needed to reach the requested
        length. An empty source file yields an empty editor instead of
        looping forever.
        """
        content = Path(__file__).read_text(encoding="utf-8")
        if not content:
            self.setText("")
            return
        # Ceiling division: the fewest repetitions covering text_size chars.
        repeats = -(-self.text_size // len(content))
        self.setText((content * repeats)[:self.text_size])


if __name__ == '__main__':
    # Script entry point: show a single editor pre-filled with usage notes
    # and run the Qt event loop until the window is closed.
    app = QApplication(sys.argv)

    usage = textwrap.dedent("""\
        '''
        Ctrl+1 = You'll decrease the size of existing text
        Ctrl+2 = You'll increase the size of existing text

        Warning: Check the window title to see how long it takes rehighlighting
        '''
    """)

    view = View("python", "monokai")
    view.setText(usage)
    view.resize(800, 600)
    view.show()

    app.exec_()

To run it you need to install: 要运行它，您需要安装：

QScintilla==2.10.8
Pygments==2.3.1
PyQt5==5.12

I'm trying to figure out how to use pygments on a QScintilla widget and right now the main problem I need to solve is the performance when dealing with non-tiny documents. 我试图弄清楚如何在QScintilla小部件上使用pygments，现在我需要解决的主要问题是处理非小型文档时的性能。

I'd like the editor to become responsive & usable when dealing with large documents (>=100kb) but I don't know very well what's the approach I should take here. 我希望编辑器在处理大型文档（> = 100kb）时变得响应和可用，但我不太清楚我应该采取什么方法。 In order to test performance you can use Ctrl + 1 or Ctrl + 2 and the widget text will be decreased/increased respectively. 为了测试性能，您可以使用Ctrl + 1或Ctrl + 2 ，小部件文本将分别减少/增加。

When I say "responsive" I mean that the highlighting computation of the visible screen should take no longer than [1-2] frames/highlight <=> [17-34] ms/highlight (assuming 60fps), so when typing you won't feel any slowdown. 当我说“响应”时，我的意思是可见屏幕的高亮计算耗时不应超过 [1-2] 帧/次高亮 <=> [17-34] 毫秒/次高亮（假设为60fps），这样在打字时你不会感到任何卡顿。

Note: As you can see in the above mcve, I've included the pygments tokenizer so you can play around with it... it feels like in order to achieve "real-time highlighting" I'd need to use memoization/caching in some smart way but I'm struggling to figure out what's the data I need to cache and what's the best way to cache it... :/ 注意：正如你在上面的mcve中看到的，我已经包含了pygments tokenizer，所以你可以玩它...感觉就像为了实现“实时突出显示”我需要使用memoization / caching以某种聪明的方式，但我正在努力弄清楚我需要缓存的数据是什么，以及缓存它的最佳方法是什么......：/

Demo: 演示：

In the above demo you can see that with this naive highlighting the editor becomes unusable very quickly: on my laptop, rehighlighting text chunks of 32kb still gives an interactive framerate, but with anything larger than that the editor becomes completely unusable. 在上面的演示中，您可以看到使用这种朴素的高亮方式，编辑器很快就会变得无法使用：在我的笔记本电脑上，重新高亮32kb的文本块仍能保持交互式帧率，但文本再大一些，编辑器就完全无法使用了。

CONSIDERATIONS: 注意事项：

The most typical case will happen when you're typing/coding on the visible screen with no selections 当您在可见屏幕上键入/编码而没有选择时，最典型的情况会发生
It may happen that you're editing multiple selections spread over the whole document, which means you won't know if these selections are near the visible screen or not. 您可能正在编辑遍布整个文档的多个选区，这意味着您无法知道这些选区是否在可见屏幕附近。 For instance, in Sublime when you press Alt+F3 you select all occurrences under the cursor 例如，在Sublime中按Alt+F3 ，会选中光标下内容的所有出现位置
In the above snippet I've used a python lexer but the algorithm shouldn't focus too much on that one. 在上面的片段中，我使用了python词法分析器，但算法不应过多关注那个。 Pygments supports ~300 lexers after all Pygments毕竟支持约300个词法分析器
The worst case scenario would happen if the visible screen is at the end of the file and one of the selections happens to live at the beginning of the screen... In case you need to rehighlight the whole document you'd need to find an alternative way even if that means the "highlighting" is not correct on the first pass 如果可见屏幕位于文件的末尾并且其中一个选项恰好位于屏幕的开头，则会出现最糟糕的情况......如果您需要重新点亮整个文档，则需要找到一个替代方式，即使这意味着第一次通过时“突出显示”不正确
The most important is performance but also correctness... that is, if you give enough time the whole document should become highlighted correctly 最重要的是性能，但也是正确性......也就是说，如果你给予足够的时间，整个文档应该正确突出显示

REFERENCES: 参考文献：

The following documents are not specific to this particular problem but they talk about possible strategies of caching and syntax highlighting: 以下文档并非特定于此特定问题，但它们讨论了缓存和语法突出显示的可能策略：

Answer 1

In highlight_slow , you're receiving start and end values, but you're ignoring the end value. 在highlight_slow ，您正在接收start值和end值，但是您忽略了结束值。 As a result, any time you type a single character, the code is rehighlighting the entire rest of the buffer. 因此，只要您键入单个字符，代码就会重新点亮缓冲区的其余部分。 This is why, if you type at the end of a long buffer, the timing is very fast - around .1 - .2 ms - but if you type at the beginning, it's very slow. 这就是为什么，如果你在一个长缓冲区的末尾键入，时间非常快 - 大约.1 - .2毫秒 - 但如果你在开头键入，它会非常慢。

Thinking just in terms of correct highlighting, in most cases (with Python, at least) when you introduce a new character only the current line needs to be re-styled. 只考虑正确突出显示，在大多数情况下（至少使用Python），当您引入新角色时，只需重新设置当前行。 Sometimes, like if you start a function definition or open a bracket, multiple lines might need to be styled. 有时，就像您启动函数定义或打开括号一样，可能需要设置多行样式。 Only when you open or close a multiline """ or ''' string - will the rest of the buffer need to be restyled. 只有当您打开或关闭多行"""或'''字符串时，才需要重新设置缓冲区的其余部分。

If you include start and end in your logging, you'll see that most of the time when you type they span a very small range. 如果在日志记录中包含start和end ，您会看到大多数情况下，当您键入时，它们的范围非常小。 If you change one line of your highlight_slow method from 如果你把highlight_slow方法中的一行从

code = view.text()[start:]

to 至

code = view.text()[start:end]

you'll see that the method almost always takes sub-millisecond time now, and it almost always gets the highlighting correct. 你会发现这个方法现在几乎总是花费不到一毫秒的时间，并且它几乎总能得到正确的突出显示。

From what I've been able to tell, this only gets the styling wrong when multiline quotes are involved. 据我所知，只有在涉及多行引号时，这才会导致样式错误。 However, your current code has the same problem: try opening a multiline string, typing enter, and continuing the string on the next line. 但是，您当前的代码有同样的问题：尝试打开多行字符串，输入enter，然后继续下一行的字符串。 The second line will be highlighted as code. 第二行将突出显示为代码。 QScintilla is leading you astray a bit here, by giving a start that does not include the beginning of the multiline quote. QScintilla在这里有点误导你，因为它给出的start并不包含多行引号的起始位置。 It's not trying to be perfect, though - the docs say: 不过，它并没有试图做到完美——文档中说：

In fact, QScintilla says: “Hey, I think you should restyle the text between the character at position start up to the character at position end“. 事实上，QScintilla说：“嘿，我认为你应该在位置开头的角色和位置末端的角色之间重新设置文字”。 You are completely free to ignore this suggestion. 你可以完全自由地忽略这个建议。

Handling multiline quoting correctly will be a bit tricky! 正确处理多行引号会有点棘手！ If it were me, and I wanted to get something working quickly, I'd probably implement a keystroke to refresh the highlighting for the entire buffer and use that when things look wrong. 如果是我，并且我希望能够快速完成某些工作，我可能会实现一个按键来刷新整个缓冲区的突出显示，并在显示看起来不对时使用它。

Answer 2

If you're happy to write your own syntax highlighter, here's a possible way of speeding it up dramatically. 如果您乐意编写自己的语法高亮显示器，可以采用一种方法来加速它。 You can do this with Pygments with a little effort; 你可以用一点点努力来完成Pygments; see the bottom of the answer for one possible way of doing this. 看到答案的底部有一种可行的方法。

The syntax highlighter is simple. 语法高亮显示器很简单。 It has a small internal data structure, representing the current context, which it updates as it goes along. 它有一个小的内部数据结构，代表当前的上下文，它随着它的进行更新。 So, for the following Python code: 因此，对于以下Python代码：

import time

def sleep_ms(ms):
    """sleeps for a length of time
    given in milliseconds"""

    time.sleep(
        ms / 1000
    )

sleep_ms(1000)
syntax error

its context might change like this, as it goes through the tokens¹: 它的上下文可能会像这样改变，因为它经历了令牌¹：

>>> [nothing]
>>> IMPORT
    IMPORT modulename
>>> [nothing]
>>> DEF
    DEF functionname
    DEF functionname, OPENPAREN
    DEF functionname, OPENPAREN
    DEF functionname ARGLIST
    DEF functionname ARGLIST COLON
>>> FUNCBODY 4s
    FUNCBODY 4s, DOUBLE_MLSTR
>>> FUNCBODY 4s, DOUBLE_MLSTR
    FUNCBODY 4s
>>> FUNCBODY 4s
>>> FUNCBODY 4s, varname
    FUNCBODY 4s, varname ATTR
    FUNCBODY 4s, varname ATTR attrname
    FUNCBODY 4s, varname ATTR attrname, OPENPAREN
>>> FUNCBODY 4s, varname ATTR attrname, OPENPAREN
>>> FUNCBODY 4s, varname ATTR attrname, OPENPAREN, varname
    FUNCBODY 4s, varname ATTR attrname, OPENPAREN, TRUEDIV varname
    FUNCBODY 4s, varname ATTR attrname, OPENPAREN, TRUEDIV varname intliteral
>>> FUNCBODY 4s, FUNCCALL
>>> FUNCBODY 4s
>>> [nothing]
    varname
    varname, OPENPAREN
    varname, OPENPAREN, intliteral
    FUNCCALL
>>> [nothing]
    varname
    ERROR

If you cache the final contexts of each line, then you can start the syntax highlighting at the line that changed and keep going until you get to a line where the context is the same as is cached; 如果你缓存每一行的最终上下文，那么你可以在更改的行开始语法高亮显示并继续前进，直到你到达一个上下文与缓存相同的行; you don't have to recompute the whole file, but if you add something like """ then it'll recompute until the end. If you get to an ERROR then you can just stop there; there's no point recalculating the syntax highlighting past a syntax error, because you don't know what the context's meant to be. (For the initial version when you open the file, you could assume that there's no context after a syntax error; this heuristic seems to work well enough.) 你不必重新计算整个文件，但是如果你添加类似"""东西那么它会重新计算直到结束。如果你得到一个ERROR然后你就可以停在那里;没有必要重新计算突出显示过去的语法语法错误，因为您不知道上下文的含义。（对于打开文件时的初始版本，您可以假设在语法错误之后没有上下文;此启发式似乎运行得很好。）

This syntax highlighter has the potential to be ridiculously accurate, or just "good enough", with virtually no perceivable difference in speed between the two. 这种语法高亮显示器有可能是荒谬的准确，或只是“足够好”，两者之间的速度几乎没有可察觉的差异。 Language-specific highlighters could even be dynamically linked plugins, and it'd still be reasonably fast! 特定语言的荧光笔甚至可以是动态链接的插件，它仍然相当快！ Additionally, if you add debouncing for highlighting of subsequent lines, typing """""" quickly enough will be just as fast as typing "" or 42 , no matter how big the file is. 此外，如果您为后续行的突出显示添加去抖动 ，那么快速键入""""""将与键入""或42一样快，无论文件有多大。

Note that this highlighter is single-pass – it doesn't highlight known variable names differently to unknown ones, for example. 请注意，此突出显示器是单通道的 - 例如，它不会突出显示已知变量名称与未知变量名称的不同。 If you wish to do this, the problem becomes considerably harder. 如果你想这样做，问题会变得更加困难。

¹: This example Python highlighter is a "ridiculously accurate" one; ¹：这个示例Python荧光笔是一个“荒谬准确”的; I probably wouldn't go with something like this if I had a time limit. 如果我有时间限制，我可能不会选择这样的东西。 Nevertheless, I've got it planned out in my head and – for now, at least – could explain it in detail if required. 尽管如此，我已经计划好了 - 至少现在 - 如果需要的话，可以详细解释它。

Your code requires surprisingly few changes to work with this technique. 您的代码需要很少的更改才能使用此技术。

Change the beginning of your get_tokens_unprocessed to: 将get_tokens_unprocessed的开头更改为：

  def get_tokens_unprocessed(self, text, stack=('root',), mutate_stack=False):
      """
      Split ``text`` into (tokentype, text) pairs.

      ``stack`` is the initial stack (default: ``['root']``)

      With ``mutate_stack=True`` the caller's ``stack`` is used directly
      instead of a copy, so it must be a list; the caller can then read the
      lexer state back from it.
      """
      lexer = self.pyg_lexer
      pos = 0
      tokendefs = lexer._tokens
      # Always bind statestack: either the caller's own list (so state
      # changes are visible to the caller) or a private copy.
      statestack = stack if mutate_stack else list(stack)
      statetokens = tokendefs[statestack[-1]]

Find some way of detecting the line number. 找到一些检测行号的方法。
In highlight_slow 's loop, do something like this (except better): 在highlight_slow的循环中，做这样的事情（除了更好）：
```
  # Resume from the lexer state cached for the line that contains `start`.
  stack = list(self.cache[line_no_of(start)])
  tokensource = self.get_tokens_unprocessed(code, stack, True)

  self.startStyling(start)
  pos = start
  for _, ttype, value in tokensource:
      self.setStyling(len(value), self.token_styles[ttype])
      pos += len(value)
      if is_line_end(pos):
          line_no = line_no_of(pos)
          # The cache stores tuples, so compare like with like: a list never
          # equals a tuple.
          state = tuple(stack)
          # Past the requested range and back in sync with what was cached
          # for this line: nothing further down can have changed.
          if pos >= end and state == self.cache.get(line_no):
              break
          self.cache[line_no] = state
```
Obviously, the code would have to be better than this, and you'd have to find some efficient way of implementing is_line_end and line_no_of ; 显然，代码必须比这更好，你必须找到一些有效的方法来实现is_line_end和line_no_of ; there's probably some Pygments way of doing this. 可能有一些Pygments这样做的方式。

This solution has at least one benefit over yours already: it supports multi-line comments. 此解决方案至少比您的解决方案有一个好处：它支持多行注释。

QScintilla中的Pygments

问题描述

2 个解决方案

解决方案1
19 2019-04-26 18:35:28

解决方案2
1 2019-04-28 14:33:26

QScintilla中的Pygments

问题描述

2 个解决方案

解决方案1 19 2019-04-26 18:35:28

解决方案2 1 2019-04-28 14:33:26

解决方案1
19 2019-04-26 18:35:28

解决方案2
1 2019-04-28 14:33:26