blob: c3f90cfa99773ce93eeb421553f809bf00b8d673 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
#include <ctype.h>
#include "utils.h"
// Tests whether `str` points at a complete ANSI SGR (color/attribute) escape
// sequence: ESC, '[', any number of digits / ';' / ':' parameter bytes, and a
// terminating 'm'.
//
// Returns true and advances `str` just past the "ESC [" introducer when the
// whole sequence is well-formed; the parameter bytes and the final 'm' are
// left for the caller to step over.
// Returns false and leaves `str` untouched otherwise, including when the
// string ends before the terminating 'm' is seen. Validation runs on a local
// cursor so a rejected sequence can never move the caller's pointer (in
// particular never beyond the string's '\0' terminator).
bool skip_valid_ansi_csi_sgr(char*& str)
{
    if (!str)
        return false;

    const char* p = str;
    // p[1] is only read when p[0] is ESC, i.e. when p[0] is not the terminator.
    if (p[0] != '\x1B' || p[1] != '[') // ESC '[' == CSI
        return false;

    for (p += 2; *p != 'm'; p++) // 'm' is the SGR final byte
    {
        const bool is_param = (*p >= '0' && *p <= '9') || *p == ';' || *p == ':';
        if (!is_param)
            return false; // foreign byte, or '\0' before the final 'm'
    }

    str += 2; // step over "ESC [" only
    return true;
}
// Sanitizes the NUL-terminated string `str` in place by overwriting unwanted
// bytes with spaces (the length of the string never changes):
//   - control characters other than '\n' and '\r' are blanked;
//   - bytes with the high bit set are kept only as part of a structurally
//     well-formed UTF-8 sequence (lead byte followed by the announced number
//     of 10xxxxxx continuation bytes; the legacy 5- and 6-byte forms are
//     accepted, overlong encodings are not checked). A truncated sequence is
//     blanked from its lead byte up to the last continuation byte consumed;
//   - ESC is kept only when `allow_color_codes` is true and it starts a
//     sequence accepted by skip_valid_ansi_csi_sgr(); otherwise it is blanked.
// A NULL `str` is ignored.
void RemoveAsciiControlSequences(char* str, bool allow_color_codes)
{
    if (!str)
        return;

    for (char* pc = str; *pc; pc++)
    {
        // Work on the unsigned value: <ctype.h> functions have undefined
        // behavior for negative arguments, which plain `char` produces for
        // bytes >= 0x80 on platforms where char is signed.
        const unsigned char c = (unsigned char)*pc;

        // Number of continuation bytes announced by a UTF-8 lead byte.
        int bytesToSkip = 0;
        if ((c & 0xE0) == 0xC0)
            bytesToSkip = 1; // 2-byte UTF-8 sequence
        else if ((c & 0xF0) == 0xE0)
            bytesToSkip = 2; // 3-byte UTF-8 sequence
        else if ((c & 0xF8) == 0xF0)
            bytesToSkip = 3; // 4-byte UTF-8 sequence
        else if ((c & 0xFC) == 0xF8)
            bytesToSkip = 4; // 5-byte UTF-8 sequence
        else if ((c & 0xFE) == 0xFC)
            bytesToSkip = 5; // 6-byte UTF-8 sequence

        if (bytesToSkip > 0)
        {
            char* const lead = pc;
            bool invalid = false;
            for (int i = 0; i < bytesToSkip; i++)
            {
                // Anything but 10xxxxxx (including '\0') ends the sequence early.
                if ((pc[1] & 0xC0) != 0x80)
                {
                    invalid = true;
                    break;
                }
                pc++;
            }

            if (invalid)
            {
                // Blank the lead byte and the continuation bytes consumed so
                // far; the offending byte itself is examined by the next
                // iteration.
                for (char* x = lead; x <= pc; x++)
                    *x = ' ';
            }
            continue; // the whole (attempted) sequence has been handled
        }

        // A disallowed control character, or a high-bit byte outside of any
        // UTF-8 sequence.
        if ((iscntrl(c) && c != '\n' && c != '\r' && c != '\x1B') || (c & 0x80) != 0)
        {
            *pc = ' ';
            continue;
        }

        if (c == '\x1B')
        {
            // Validate through a scratch pointer: the helper takes its
            // argument by reference and may move it, and `pc` must keep
            // pointing at the ESC so that a rejected sequence blanks the ESC
            // itself and nothing is ever written past the terminator.
            char* cursor = pc;
            if (allow_color_codes && skip_valid_ansi_csi_sgr(cursor))
            {
                // Keep ESC and step onto the '['; the loop increment then
                // moves on to the parameter bytes, which are plain printable
                // characters and pass through unchanged.
                pc++;
            }
            else
            {
                *pc = ' ';
            }
        }
    }
}
|