1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
#include <string.h>
#include <stdint.h>
#include <limits.h>
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#endif
/* Helpers for the word-at-a-time scalar scan (byte patterns assume 8-bit bytes). */
#define SS (sizeof(size_t))                 /* bytes in one machine word */
#define ALIGN (sizeof(size_t) - 1)          /* address bits that are set when a pointer is not word-aligned */
#define ONES ((size_t)-1 / UCHAR_MAX)       /* 0x01 repeated in every byte of a word */
#define HIGHS (ONES * (UCHAR_MAX / 2 + 1))  /* 0x80 repeated in every byte of a word */
/* Nonzero exactly when some byte of the word x is zero. Evaluates x twice. */
#define HASZERO(x) (((x) - ONES) & ~(x) & HIGHS)

/*
 * Search the first n bytes starting at src for the byte value of c.
 *
 * Returns a pointer to the first matching byte, or a null pointer when
 * none of the n bytes match (which includes n == 0).
 */
void *memchr(const void *src, int c, size_t n)
{
#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
// Clang 19 and Clang 20 are excluded: a compiler bug
// (llvm/llvm-project#146574) triggers an ICE when inline assembly
// produces a vector result.
#if __clang_major__ != 19 && __clang_major__ != 20
	// An empty range cannot contain a match. Otherwise work with n - 1,
	// so that the subtraction at the bottom of the loop overflows exactly
	// when no bytes of the range are left.
	if (n == 0) {
		return NULL;
	}
	n--;

	// The search proceeds in aligned 16-byte chunks. `skip` is the number
	// of leading bytes of the current chunk that lie before src; it is
	// nonzero only for the first chunk.
	uintptr_t skip = (uintptr_t)src % sizeof(v128_t);
	uintptr_t chunk = (uintptr_t)src - skip;
	const v128_t needle16 = wasm_i8x16_splat(c);

	for (;;) {
		// Reading outside the object src points into is UB in C, so the
		// load is issued through inline assembly, which is opaque to the
		// compiler and yields exactly the intended machine instruction.
		v128_t lanes;
		__asm__ (
			"local.get %1\n"
			"v128.load 0\n"
			"local.set %0\n"
			: "=r"(lanes)
			: "r"(chunk)
			: "memory");

		const v128_t eq = wasm_i8x16_eq(lanes, needle16);

		// any_true is tested first because bitmask is slow on AArch64.
		if (wasm_v128_any_true(eq)) {
			// Drop the match bits of the bytes that precede src
			// (little-endian lane order) before counting trailing zeros.
			int hits = wasm_i8x16_bitmask(eq) >> skip << skip;

			// Some bit is set unless skip cleared it; telling the
			// compiler helps if it unrolls the loop.
			__builtin_assume(hits || skip);

			// hits == 0 here means every match was before src,
			// which counts as no match in this chunk.
			if (hits) {
				// Lowest set bit is the first match. It only counts
				// if it is not past the end of the range; n was
				// decremented above, hence the inclusive comparison.
				size_t first = __builtin_ctz(hits);
				if (first - skip > n) {
					return NULL;
				}
				return (char *)src + (chunk + first - (uintptr_t)src);
			}
		}

		// Account for the bytes just examined; overflow means the
		// range is exhausted.
		if (__builtin_sub_overflow(n, sizeof(v128_t) - skip, &n)) {
			return NULL;
		}
		skip = 0;
		chunk += sizeof(v128_t);
	}
#endif
#endif
	const unsigned char *p = src;
	const unsigned char needle = (unsigned char)c;

#ifdef __GNUC__
	// Step bytewise until p is word-aligned, the range ends, or a match
	// is found.
	while (((uintptr_t)p & ALIGN) != 0 && n != 0 && *p != needle) {
		p++;
		n--;
	}

	if (n != 0 && *p != needle) {
		// Skip whole words that cannot contain the needle: XOR with the
		// replicated needle turns a matching byte into zero, which
		// HASZERO detects. may_alias permits reading the bytes as words.
		typedef size_t __attribute__((__may_alias__)) word;
		const size_t pattern = ONES * needle;
		const word *wp = (const void *)p;

		while (n >= SS && !HASZERO(*wp ^ pattern)) {
			wp++;
			n -= SS;
		}
		p = (const void *)wp;
	}
#endif

	// Bytewise scan of the remaining bytes (locates the exact position
	// inside a word flagged above, or handles the tail).
	while (n != 0 && *p != needle) {
		p++;
		n--;
	}

	if (n == 0) {
		return NULL;
	}
	return (void *)p;
}
|